diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..aec0f3daa33e5ec96a834ec39f8ceccba5ecc079 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict filter=lfs diff=lfs merge=lfs -text +stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text diff --git a/stable-diffusion.cpp/.dockerignore b/stable-diffusion.cpp/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..64a58a7815f2dca777d36ed512ab9c3c454624e8 --- /dev/null +++ b/stable-diffusion.cpp/.dockerignore @@ -0,0 +1,6 @@ +build*/ +test/ + +.cache/ +*.swp +models/ \ No newline at end of file diff --git a/stable-diffusion.cpp/.github/workflows/build.yml b/stable-diffusion.cpp/.github/workflows/build.yml new file mode 100644 index 0000000000000000000000000000000000000000..50e6a92275affffaa3503ec6e30e6984785df7c2 --- /dev/null +++ b/stable-diffusion.cpp/.github/workflows/build.yml @@ -0,0 +1,201 @@ +name: CI + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + branches: + - master + - ci + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + +jobs: + ubuntu-latest-cmake: + runs-on: ubuntu-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release + + #- name: Test + #id: cmake_test + #run: | + #cd build + #ctest --verbose --timeout 900 + + macOS-latest-cmake: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake .. + cmake --build . --config Release + + #- name: Test + #id: cmake_test + #run: | + #cd build + #ctest --verbose --timeout 900 + + windows-latest-cmake: + runs-on: windows-latest + + strategy: + matrix: + include: + - build: 'noavx' + defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + - build: 'avx2' + defines: '-DGGML_AVX2=ON' + - build: 'avx' + defines: '-DGGML_AVX2=OFF' + - build: 'avx512' + defines: '-DGGML_AVX512=ON' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. ${{ matrix.defines }} + cmake --build . 
--config Release + + - name: Check AVX512F support + id: check_avx512f + if: ${{ matrix.build == 'avx512' }} + continue-on-error: true + run: | + cd build + $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) + $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) + $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') + echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c + & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main + .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" + + #- name: Test + #id: cmake_test + #run: | + #cd build + #ctest -C Release --verbose --timeout 900 + + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt + Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt + 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v3 + with: + path: | + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip + + release: + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + + runs-on: ubuntu-latest + + needs: + - ubuntu-latest-cmake + - macOS-latest-cmake + - windows-latest-cmake + + steps: + - name: Download artifacts + id: download-artifact + uses: actions/download-artifact@v3 + + - name: Get commit hash + id: commit + uses: pr-mpt/actions-commit-hash@v2 + + - name: Create release + id: create_release + uses: anzz1/action-create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} + + - name: Upload release + id: upload_release + uses: actions/github-script@v3 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const path = require('path'); + const fs = require('fs'); + const release_id = '${{ steps.create_release.outputs.id }}'; + for (let file of await fs.readdirSync('./artifact')) { + if (path.extname(file) === '.zip') { + console.log('uploadReleaseAsset', file); + await github.repos.uploadReleaseAsset({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: release_id, + name: file, + data: await fs.readFileSync(`./artifact/${file}`) + }); + } + } diff --git a/stable-diffusion.cpp/.gitignore b/stable-diffusion.cpp/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..59a8a2cabaeb936d565790be5036000088c842d5 --- /dev/null +++ b/stable-diffusion.cpp/.gitignore @@ -0,0 +1,5 @@ +build*/ +test/ + +.cache/ +*.swp diff --git a/stable-diffusion.cpp/.gitmodules b/stable-diffusion.cpp/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..cc639feeee2cd87ffc3d9834b7a70480b18986f3 --- /dev/null +++ 
b/stable-diffusion.cpp/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ggml"] + path = ggml + url = https://github.com/leejet/ggml.git diff --git a/stable-diffusion.cpp/CMakeLists.txt b/stable-diffusion.cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d0a1ce0cb78fc8648cf819fe1e3f4341e47d4b --- /dev/null +++ b/stable-diffusion.cpp/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.12) +project("stable-diffusion") + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(SD_STANDALONE ON) +else() + set(SD_STANDALONE OFF) +endif() + +# +# Option list +# + +# general +#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE}) +option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE}) +option(BUILD_SHARED_LIBS "sd: build shared libs" OFF) +#option(SD_BUILD_SERVER "sd: build server example" ON) + + +# deps +add_subdirectory(ggml) + +set(SD_LIB stable-diffusion) + +add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp) +target_link_libraries(${SD_LIB} PUBLIC ggml) +target_include_directories(${SD_LIB} PUBLIC .) +target_compile_features(${SD_LIB} PUBLIC cxx_std_11) + + +if (SD_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() + diff --git a/stable-diffusion.cpp/Dockerfile b/stable-diffusion.cpp/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..bd9a378f02df238112b96d29d67b1a76d4e2472b --- /dev/null +++ b/stable-diffusion.cpp/Dockerfile @@ -0,0 +1,17 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && apt-get install -y build-essential git cmake + +WORKDIR /sd.cpp + +COPY . . + +RUN mkdir build && cd build && cmake .. && cmake --build . --config Release + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /sd.cpp/build/bin/sd /sd + +ENTRYPOINT [ "/sd" ] \ No newline at end of file diff --git a/stable-diffusion.cpp/LICENSE b/stable-diffusion.cpp/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..56e1e5a63852c3bcd5ac4fcdfecb7666f5274169 --- /dev/null +++ b/stable-diffusion.cpp/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 leejet + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/stable-diffusion.cpp/README.md b/stable-diffusion.cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..34aca48db75256883fc0eedbcbeaff019b934994 --- /dev/null +++ b/stable-diffusion.cpp/README.md @@ -0,0 +1,198 @@ +
+![](./assets/a%20lovely%20cat.png)
+
+# stable-diffusion.cpp
+
+Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
+
+## Features
+
+- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- 16-bit and 32-bit float support
+- 4-bit, 5-bit and 8-bit integer quantization support
+- Accelerated memory-efficient CPU inference
+    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
+- AVX, AVX2 and AVX512 support for x86 architectures
+- SD1.x and SD2.x support
+- Original `txt2img` and `img2img` modes
+- Negative prompt
+- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)-style tokenizer (not all the features, only token weighting for now)
+- Sampling methods
+    - `Euler A`
+    - `Euler`
+    - `Heun`
+    - `DPM2`
+    - `DPM++ 2M`
+    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
+    - `DPM++ 2S a`
+- Cross-platform reproducibility (`--rng cuda`, consistent with stable-diffusion-webui's GPU RNG)
+- Embeds generation parameters into the PNG output as a webui-compatible text string
+- Supported platforms
+    - Linux
+    - macOS
+    - Windows
+    - Android (via Termux)
+
+### TODO
+
+- [ ] More sampling methods
+- [ ] GPU support
+- [ ] Make inference faster
+    - The current implementation of ggml_conv_2d is slow and has high memory usage
+- [ ] Continue to reduce memory usage (e.g. by quantizing the weights of ggml_conv_2d)
+- [ ] LoRA support
+- [ ] k-quants support
+
+## Usage
+
+### Get the Code
+
+```
+git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+cd stable-diffusion.cpp
+```
+
+- If you have already cloned the repository, you can use the following commands to update it to the latest code.
+
+```
+cd stable-diffusion.cpp
+git pull origin master
+git submodule init
+git submodule update
+```
+
+### Convert weights
+
+- Download the original weights (.ckpt or .safetensors). For example:
+    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+    - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
+
+    ```shell
+    curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
+    # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
+    ```
+
+- Convert the weights to the ggml model format
+
+    ```shell
+    cd models
+    pip install -r requirements.txt
+    python convert.py [path to weights] --out_type [output precision]
+    # For example: python convert.py sd-v1-4.ckpt --out_type f16
+    ```
+
+### Quantization
+
+You can specify the output model format using the `--out_type` parameter:
+
+- `f16` for 16-bit floating point
+- `f32` for 32-bit floating point
+- `q8_0` for 8-bit integer quantization
+- `q5_0` or `q5_1` for 5-bit integer quantization
+- `q4_0` or `q4_1` for 4-bit integer quantization
+
+### Build
+
+#### Build from scratch
+
+```shell
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+```
+
+##### Using OpenBLAS
+
+```
+cmake .. -DGGML_OPENBLAS=ON
+cmake --build . --config Release
+```
+
+### Run
+
+```
+usage: ./bin/sd [arguments]
+
+arguments:
+  -h, --help                         show this help message and exit
+  -M, --mode [txt2img or img2img]    generation mode (default: txt2img)
+  -t, --threads N                    number of threads to use during computation (default: -1).
+                                     If threads <= 0, then threads will be set to the number of CPU physical cores
+  -m, --model [MODEL]                path to model
+  -i, --init-img [IMAGE]             path to the input image, required by img2img
+  -o, --output OUTPUT                path to write result image to (default: ./output.png)
+  -p, --prompt [PROMPT]              the prompt to render
+  -n, --negative-prompt PROMPT       the negative prompt (default: "")
+  --cfg-scale SCALE                  unconditional guidance scale (default: 7.0)
+  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
+                                     1.0 corresponds to full destruction of information in init image
+  -H, --height H                     image height, in pixel space (default: 512)
+  -W, --width W                      image width, in pixel space (default: 512)
+  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2}
+                                     sampling method (default: "euler_a")
+  --steps STEPS                      number of sample steps (default: 20)
+  --rng {std_default, cuda}          RNG (default: cuda)
+  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
+  --schedule {discrete, karras}      denoiser sigma schedule (default: discrete)
+  -v, --verbose                      print extra info
+```
+
+#### txt2img example
+
+```
+./bin/sd -m ../models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat"
+```
+
+Using formats of different precisions will yield results of varying quality.
+
+| f32 | f16 | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
+| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| ![](./assets/f32.png) | ![](./assets/f16.png) | ![](./assets/q8_0.png) | ![](./assets/q5_0.png) | ![](./assets/q5_1.png) | ![](./assets/q4_0.png) | ![](./assets/q4_1.png) |
+
+#### img2img example
+
+- `./output.png` is the image generated from the above txt2img pipeline
+
+```
+./bin/sd --mode img2img -m ../models/sd-v1-4-ggml-model-f16.bin -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+```
+
+![](./assets/img2img_output.png)
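+
+#### Using the library directly
+
+The `sd` CLI is a thin wrapper around the `stable-diffusion` library target. Below is a minimal sketch of driving the library from your own code; it only mirrors the constructor, `load_from_file` and `txt2img` call sequence used in `examples/main.cpp` (the model path and parameter values are placeholders), so treat `stable-diffusion.h` as the authoritative reference for the actual signatures.
+
+```cpp
+#include <cstdint>
+#include <vector>
+
+#include "stable-diffusion.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"
+
+int main() {
+    // Arguments as in examples/main.cpp: n_threads, vae_decode_only,
+    // a third flag that main.cpp always passes as true, and the RNG type.
+    // vae_decode_only can stay true for txt2img (no image encoding needed).
+    StableDiffusion sd(4, true, true, CUDA_RNG);
+    if (!sd.load_from_file("../models/sd-v1-4-ggml-model-f16.bin", DEFAULT)) {
+        return 1;
+    }
+    // prompt, negative prompt, cfg_scale, width, height, sampler, steps, seed
+    std::vector<uint8_t> img = sd.txt2img("a lovely cat", "", 7.0f, 512, 512, EULER_A, 20, 42);
+    if (img.empty()) {
+        return 1;  // generation failed
+    }
+    // This repo's stb_image_write takes an extra trailing string that is
+    // embedded into the PNG as the generation parameters text.
+    stbi_write_png("output.png", 512, 512, 3, img.data(), 0, "a lovely cat");
+    return 0;
+}
+```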
+ +### Docker + +#### Building using Docker + +```shell +docker build -t sd . +``` + +#### Run + +```shell +docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...] +# For example +# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat" -v -o /output/output.png +``` + +## Memory/Disk Requirements + +| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 | +| ---- | ---- |---- |---- |---- |---- |---- |---- | +| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G | +| **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G | + + +## References + +- [ggml](https://github.com/ggerganov/ggml) +- [stable-diffusion](https://github.com/CompVis/stable-diffusion) +- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion) +- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) +- [k-diffusion](https://github.com/crowsonkb/k-diffusion) diff --git a/stable-diffusion.cpp/assets/a lovely cat.png b/stable-diffusion.cpp/assets/a lovely cat.png new file mode 100644 index 0000000000000000000000000000000000000000..6aa65196be1d1954586d063b2d93fbacdbaf7fbc Binary files /dev/null and b/stable-diffusion.cpp/assets/a lovely cat.png differ diff --git a/stable-diffusion.cpp/assets/f16.png b/stable-diffusion.cpp/assets/f16.png new file mode 100644 index 0000000000000000000000000000000000000000..6aa65196be1d1954586d063b2d93fbacdbaf7fbc Binary files /dev/null and b/stable-diffusion.cpp/assets/f16.png differ diff --git a/stable-diffusion.cpp/assets/f32.png b/stable-diffusion.cpp/assets/f32.png new file mode 100644 index 0000000000000000000000000000000000000000..3faadc41848ba14a50f05b723dc87c1ffa0d3c3b Binary files /dev/null and b/stable-diffusion.cpp/assets/f32.png differ diff --git a/stable-diffusion.cpp/assets/img2img_output.png b/stable-diffusion.cpp/assets/img2img_output.png new file mode 100644 index 0000000000000000000000000000000000000000..80579a184f1597d8eea1df05e5b8e94c67f2a907 Binary files /dev/null and b/stable-diffusion.cpp/assets/img2img_output.png differ diff --git a/stable-diffusion.cpp/assets/q4_0.png b/stable-diffusion.cpp/assets/q4_0.png new file mode 100644 index 0000000000000000000000000000000000000000..707e43274bd377c77e1798028c83eed2374bfce5 Binary files /dev/null and b/stable-diffusion.cpp/assets/q4_0.png differ diff --git a/stable-diffusion.cpp/assets/q4_1.png b/stable-diffusion.cpp/assets/q4_1.png new file mode 100644 index 0000000000000000000000000000000000000000..dc1b17cdbdacd4f043f3dfe33ab28495ae43af92 Binary files /dev/null and b/stable-diffusion.cpp/assets/q4_1.png differ diff --git a/stable-diffusion.cpp/assets/q5_0.png b/stable-diffusion.cpp/assets/q5_0.png new file mode 100644 index 0000000000000000000000000000000000000000..cd5d7ee3869b0e66b4c6134cc1fd13e1bbe2efd9 Binary files /dev/null and b/stable-diffusion.cpp/assets/q5_0.png differ diff --git a/stable-diffusion.cpp/assets/q5_1.png b/stable-diffusion.cpp/assets/q5_1.png new file mode 100644 index 0000000000000000000000000000000000000000..3d08ac695ac8b57a6374a5c1431d7dda91cfa9d9 Binary files /dev/null and b/stable-diffusion.cpp/assets/q5_1.png differ diff --git a/stable-diffusion.cpp/assets/q8_0.png b/stable-diffusion.cpp/assets/q8_0.png new file mode 100644 index 0000000000000000000000000000000000000000..3d08ac695ac8b57a6374a5c1431d7dda91cfa9d9 Binary files /dev/null and b/stable-diffusion.cpp/assets/q8_0.png differ diff --git 
a/stable-diffusion.cpp/examples/CMakeLists.txt b/stable-diffusion.cpp/examples/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a55396e20bbc7d29fa98da5ee074f1f35dfd0920
--- /dev/null
+++ b/stable-diffusion.cpp/examples/CMakeLists.txt
@@ -0,0 +1,8 @@
+# TODO: move into its own subdirectory
+# TODO: make stb libs a target (maybe common)
+set(SD_TARGET sd)
+
+add_executable(${SD_TARGET} main.cpp stb_image.h stb_image_write.h)
+install(TARGETS ${SD_TARGET} RUNTIME)
+target_link_libraries(${SD_TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${SD_TARGET} PUBLIC cxx_std_11)
diff --git a/stable-diffusion.cpp/examples/main.cpp b/stable-diffusion.cpp/examples/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2e79ae4617067383f6e2d8c085a60981e5d4846
--- /dev/null
+++ b/stable-diffusion.cpp/examples/main.cpp
@@ -0,0 +1,473 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <fstream>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <vector>
+
+#include "stable-diffusion.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_STATIC
+#include "stb_image_write.h"
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(_WIN32)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+
+#define TXT2IMG "txt2img"
+#define IMG2IMG "img2img"
+
+// get_num_physical_cores is copied from
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
+// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
+int32_t get_num_physical_cores() {
+#ifdef __linux__
+    // enumerate the set of thread siblings, num entries is num cores
+    std::unordered_set<std::string> siblings;
+    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
+        std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
+        if (!thread_siblings.is_open()) {
+            break;  // no more cpus
+        }
+        std::string line;
+        if (std::getline(thread_siblings, line)) {
+            siblings.insert(line);
+        }
+    }
+    if (siblings.size() > 0) {
+        return static_cast<int32_t>(siblings.size());
+    }
+#elif defined(__APPLE__) && defined(__MACH__)
+    int32_t num_physical_cores;
+    size_t len = sizeof(num_physical_cores);
+    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+#elif defined(_WIN32)
+    // TODO: Implement
+#endif
+    unsigned int n_threads = std::thread::hardware_concurrency();
+    return n_threads > 0 ? (n_threads <= 4 ?
n_threads : n_threads / 2) : 4; +} + +const char* rng_type_to_str[] = { + "std_default", + "cuda", +}; + +// Names of the sampler method, same order as enum SampleMethod in stable-diffusion.h +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2"}; + +// Names of the sigma schedule overrides, same order as Schedule in stable-diffusion.h +const char* schedule_str[] = { + "default", + "discrete", + "karras"}; + +struct Option { + int n_threads = -1; + std::string mode = TXT2IMG; + std::string model_path; + std::string output_path = "output.png"; + std::string init_img; + std::string prompt; + std::string negative_prompt; + float cfg_scale = 7.0f; + int w = 512; + int h = 512; + SampleMethod sample_method = EULER_A; + Schedule schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + RNGType rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + + void print() { + printf("Option: \n"); + printf(" n_threads: %d\n", n_threads); + printf(" mode: %s\n", mode.c_str()); + printf(" model_path: %s\n", model_path.c_str()); + printf(" output_path: %s\n", output_path.c_str()); + printf(" init_img: %s\n", init_img.c_str()); + printf(" prompt: %s\n", prompt.c_str()); + printf(" negative_prompt: %s\n", negative_prompt.c_str()); + printf(" cfg_scale: %.2f\n", cfg_scale); + printf(" width: %d\n", w); + printf(" height: %d\n", h); + printf(" sample_method: %s\n", sample_method_str[sample_method]); + printf(" schedule: %s\n", schedule_str[schedule]); + printf(" sample_steps: %d\n", sample_steps); + printf(" strength: %.2f\n", strength); + printf(" rng: %s\n", rng_type_to_str[rng_type]); + printf(" seed: %ld\n", seed); + } +}; + +void print_usage(int argc, const char* argv[]) { + printf("usage: %s [arguments]\n", argv[0]); + printf("\n"); + printf("arguments:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n"); + printf(" -t, --threads N number of threads to use during computation (default: -1).\n"); + printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); + printf(" -m, --model [MODEL] path to model\n"); + printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); + printf(" -o, --output OUTPUT path to write result image to (default: .\\output.png)\n"); + printf(" -p, --prompt [PROMPT] the prompt to render\n"); + printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); + printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); + printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); + printf(" 1.0 corresponds to full destruction of information in init image\n"); + printf(" -H, --height H image height, in pixel space (default: 512)\n"); + printf(" -W, --width W image width, in pixel space (default: 512)\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2}\n"); + printf(" sampling method (default: \"euler_a\")\n"); + printf(" --steps STEPS number of sample steps (default: 20)\n"); + printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); + printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); + printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n"); + printf(" -v, --verbose print extra info\n"); +} + +void parse_args(int argc, const char* argv[], Option* opt) { + bool invalid_arg = false; + + for 
(int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->n_threads = std::stoi(argv[i]); + } else if (arg == "-M" || arg == "--mode") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->mode = argv[i]; + + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->model_path = argv[i]; + } else if (arg == "-i" || arg == "--init-img") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->init_img = argv[i]; + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->output_path = argv[i]; + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->prompt = argv[i]; + } else if (arg == "-n" || arg == "--negative-prompt") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->negative_prompt = argv[i]; + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->cfg_scale = std::stof(argv[i]); + } else if (arg == "--strength") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->strength = std::stof(argv[i]); + } else if (arg == "-H" || arg == "--height") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->h = std::stoi(argv[i]); + } else if (arg == "-W" || arg == "--width") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->w = std::stoi(argv[i]); + } else if (arg == "--steps") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->sample_steps = std::stoi(argv[i]); + } else if (arg == "--rng") { + if (++i >= argc) { + invalid_arg = true; + break; + } + std::string rng_type_str = argv[i]; + if (rng_type_str == "std_default") { + opt->rng_type = STD_DEFAULT_RNG; + } else if (rng_type_str == "cuda") { + opt->rng_type = CUDA_RNG; + } else { + invalid_arg = true; + break; + } + } else if (arg == "--schedule") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* schedule_selected = argv[i]; + int schedule_found = -1; + for (int d = 0; d < N_SCHEDULES; d++) { + if (!strcmp(schedule_selected, schedule_str[d])) { + schedule_found = d; + } + } + if (schedule_found == -1) { + invalid_arg = true; + break; + } + opt->schedule = (Schedule)schedule_found; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_arg = true; + break; + } + opt->seed = std::stoll(argv[i]); + } else if (arg == "--sampling-method") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* sample_method_selected = argv[i]; + int sample_method_found = -1; + for (int m = 0; m < N_SAMPLE_METHODS; m++) { + if (!strcmp(sample_method_selected, sample_method_str[m])) { + sample_method_found = m; + } + } + if (sample_method_found == -1) { + invalid_arg = true; + break; + } + opt->sample_method = (SampleMethod)sample_method_found; + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-v" || arg == "--verbose") { + opt->verbose = true; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + if (invalid_arg) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + } + + if (opt->n_threads <= 0) { + opt->n_threads = get_num_physical_cores(); + } + + if (opt->mode != TXT2IMG && opt->mode != IMG2IMG) { + fprintf(stderr, "error: 
invalid mode %s, must be one of ['%s', '%s']\n",
+                opt->mode.c_str(), TXT2IMG, IMG2IMG);
+        exit(1);
+    }
+
+    if (opt->prompt.length() == 0) {
+        fprintf(stderr, "error: the following arguments are required: prompt\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (opt->model_path.length() == 0) {
+        fprintf(stderr, "error: the following arguments are required: model_path\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (opt->mode == IMG2IMG && opt->init_img.length() == 0) {
+        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (opt->output_path.length() == 0) {
+        fprintf(stderr, "error: the following arguments are required: output_path\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (opt->w <= 0 || opt->w % 64 != 0) {
+        fprintf(stderr, "error: the width must be a multiple of 64\n");
+        exit(1);
+    }
+
+    if (opt->h <= 0 || opt->h % 64 != 0) {
+        fprintf(stderr, "error: the height must be a multiple of 64\n");
+        exit(1);
+    }
+
+    if (opt->sample_steps <= 0) {
+        fprintf(stderr, "error: the sample_steps must be greater than 0\n");
+        exit(1);
+    }
+
+    if (opt->strength < 0.f || opt->strength > 1.f) {
+        fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
+        exit(1);
+    }
+
+    if (opt->seed < 0) {
+        srand((int)time(NULL));
+        opt->seed = rand();
+    }
+}
+
+std::string basename(const std::string& path) {
+    size_t pos = path.find_last_of('/');
+    if (pos != std::string::npos) {
+        return path.substr(pos + 1);
+    }
+    pos = path.find_last_of('\\');
+    if (pos != std::string::npos) {
+        return path.substr(pos + 1);
+    }
+    return path;
+}
+
+int main(int argc, const char* argv[]) {
+    Option opt;
+    parse_args(argc, argv, &opt);
+
+    if (opt.verbose) {
+        opt.print();
+        printf("%s", sd_get_system_info().c_str());
+        set_sd_log_level(SDLogLevel::DEBUG);
+    }
+
+    bool vae_decode_only = true;
+    std::vector<uint8_t> init_img;
+    if (opt.mode == IMG2IMG) {
+        vae_decode_only = false;
+
+        int c = 0;
+        unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
+        if (img_data == NULL) {
+            fprintf(stderr, "load image from '%s' failed\n", opt.init_img.c_str());
+            return 1;
+        }
+        if (c != 3) {
+            fprintf(stderr, "input image must be a 3-channel RGB image, but got %d channels\n", c);
+            free(img_data);
+            return 1;
+        }
+        if (opt.w <= 0 || opt.w % 64 != 0) {
+            fprintf(stderr, "error: the width of image must be a multiple of 64\n");
+            free(img_data);
+            return 1;
+        }
+        if (opt.h <= 0 || opt.h % 64 != 0) {
+            fprintf(stderr, "error: the height of image must be a multiple of 64\n");
+            free(img_data);
+            return 1;
+        }
+        init_img.assign(img_data, img_data + (opt.w * opt.h * c));
+    }
+
+    StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.rng_type);
+    if (!sd.load_from_file(opt.model_path, opt.schedule)) {
+        return 1;
+    }
+
+    std::vector<uint8_t> img;
+    if (opt.mode == TXT2IMG) {
+        img = sd.txt2img(opt.prompt,
+                         opt.negative_prompt,
+                         opt.cfg_scale,
+                         opt.w,
+                         opt.h,
+                         opt.sample_method,
+                         opt.sample_steps,
+                         opt.seed);
+    } else {
+        img = sd.img2img(init_img,
+                         opt.prompt,
+                         opt.negative_prompt,
+                         opt.cfg_scale,
+                         opt.w,
+                         opt.h,
+                         opt.sample_method,
+                         opt.sample_steps,
+                         opt.strength,
+                         opt.seed);
+    }
+
+    if (img.size() == 0) {
+        fprintf(stderr, "generate failed\n");
+        return 1;
+    }
+
+    std::string parameter_string = opt.prompt + "\n";
+    if (opt.negative_prompt.size() != 0) {
+        parameter_string += "Negative prompt: " + opt.negative_prompt + "\n";
+    }
+    parameter_string += "Steps: " +
std::to_string(opt.sample_steps) + ", "; + parameter_string += "CFG scale: " + std::to_string(opt.cfg_scale) + ", "; + parameter_string += "Seed: " + std::to_string(opt.seed) + ", "; + parameter_string += "Size: " + std::to_string(opt.w) + "x" + std::to_string(opt.h) + ", "; + parameter_string += "Model: " + basename(opt.model_path) + ", "; + parameter_string += "RNG: " + std::string(rng_type_to_str[opt.rng_type]) + ", "; + parameter_string += "Sampler: " + std::string(sample_method_str[opt.sample_method]); + if (opt.schedule == KARRAS) { + parameter_string += " karras"; + } + parameter_string += ", "; + parameter_string += "Version: stable-diffusion.cpp"; + + stbi_write_png(opt.output_path.c_str(), opt.w, opt.h, 3, img.data(), 0, parameter_string.c_str()); + printf("save result image to '%s'\n", opt.output_path.c_str()); + + return 0; +} diff --git a/stable-diffusion.cpp/examples/stb_image.h b/stable-diffusion.cpp/examples/stb_image.h new file mode 100644 index 0000000000000000000000000000000000000000..5e807a0a6e7cdbfbbf48dff5f5d3f3693c2bc851 --- /dev/null +++ b/stable-diffusion.cpp/examples/stb_image.h @@ -0,0 +1,7987 @@ +/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb + no warranty implied; use at your own risk + + Do this: + #define STB_IMAGE_IMPLEMENTATION + before you include this file in *one* C or C++ file to create the implementation. + + // i.e. it should look like this: + #include ... + #include ... + #include ... + #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" + + You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. + And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. 
+ +RECENT REVISION HISTORY: + + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. + + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. 
Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. 
When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. 
+// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). +// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. 
By
+// default this is set to (1 << 24), which is 16777216, but that's still
+// very big.
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels
+
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
+};
+
+#include <stdlib.h>
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef STBIDEF
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load           (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR + STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); + STBIDEF void stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef 
STB_IMAGE_IMPLEMENTATION
+
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp, pow
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) && __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL _Thread_local
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL __thread
+      #endif
+   #endif
+#endif
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#endif
+
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
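+   // (the MSVC path above instead checks CPUID leaf 1, edx bit 26 -- the SSE2 feature flag)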
+   return 1;
+}
+#endif
+
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   int read_from_callbacks;
+   int buflen;
+   stbi_uc buffer_start[128];
+   int callback_already_read;
+
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// initialize a memory-decode context
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
+}
+
+// initialize a callback-based context
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   return (int) fread(data,1,size,(FILE*) user);
+}
+
+static void stbi__stdio_skip(void *user, int n)
+{
+   int ch;
+   fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid.
*/ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int 
stbi__pnm_is16(stbi__context *s);
+#endif
+
+static
+#ifdef STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
+#endif
+const char *stbi__g_failure_reason;
+
+STBIDEF const char *stbi_failure_reason(void)
+{
+   return stbi__g_failure_reason;
+}
+
+#ifndef STBI_NO_FAILURE_STRINGS
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str;
+   return 0;
+}
+#endif
+
+static void *stbi__malloc(size_t size)
+{
+    return STBI_MALLOC(size);
+}
+
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum is valid, 0 on overflow.
+// negative terms are considered invalid.
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0;
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INT_MAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product is valid, 0 on overflow.
+// negative factors are considered invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0 || b < 0) return 0;
+   if (b == 0) return 1; // mul-by-0 is always safe
+   // portable way to check for no overflows in a*b
+   return a <= INT_MAX/b;
+}
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
+}
+#endif
+
+// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// mallocs with size overflow checking
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
+   return stbi__malloc(a*b + add);
+}
+#endif
+
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
+   return stbi__malloc(a*b*c + add);
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
+   return stbi__malloc(a*b*c*d + add);
+}
+#endif
+
+// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
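+// e.g. (illustrative, not part of the library): with 32-bit int,
+// stbi__addints_valid(INT_MAX, 1) and stbi__mul2sizes_valid(65536, 65536)
+// both return 0 (overflow), while stbi__mad3sizes_valid(4096, 4096, 4, 0)
+// returns 1, admitting a 64MB RGBA allocation.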
+static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? 
stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? 
req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return 
stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   if (stbi__vertically_flip_on_load) {
+      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
+   }
+
+   return result;
+}
+#endif
+
+#ifndef STBI_NO_LINEAR
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *data;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
+      if (hdr_data)
+         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
+      return hdr_data;
+   }
+   #endif
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+   if (data)
+      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
+   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+}
+
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *result;
+   FILE *f = stbi__fopen(filename, "rb");
+   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
+   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
+}
+
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// the is-hdr-or-not query is defined independent of whether STBI_NO_LINEAR is
+// defined, for API simplicity; if STBI_NO_HDR is defined, it always
+// reports false!
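+//
+// illustrative usage (not part of the library; "env.hdr" is a hypothetical file):
+//    int w, h, n;
+//    if (stbi_is_hdr("env.hdr")) {
+//       float *pixels = stbi_loadf("env.hdr", &w, &h, &n, 0);   // linear floats
+//       /* ... use pixels ... */ stbi_image_free(pixels);
+//    } else {
+//       unsigned char *pixels = stbi_load("env.hdr", &w, &h, &n, 0); // 8-bit LDR
+//       /* ... use pixels ... */ stbi_image_free(pixels);
+//    }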
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
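+   // negative skips only arise from corrupt data; the next branch clamps to end-of-buffer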
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). 
png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, 
so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 
'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. 
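+// each packed entry (see the encoding below): bits 0..3 = total bits consumed
+// (code plus magnitude), bits 4..7 = run length, bits 8..15 = the decoded
+// coefficient value as a signed 8-bit int.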
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! 
+      return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));
+}
+
+// get some unsigned bits
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? 
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + int diff,dc; + int t; + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { 
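+      // the single unsigned compare is true both when x < 0 (wraps to a huge value) and when x > 255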
+ if (x < 0) return 0; + if (x > 255) return 255; + } + return (stbi_uc) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) * 4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + stbi_uc *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0]*4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". 
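+// note: the aligned _mm_load_si128 loads below assume 16-byte-aligned
+// coefficient blocks; callers are expected to allocate them with STBI_SIMD_ALIGN.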
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), 
stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. 
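+// Unlike SSE2 there is no pairwise multiply-add (PMADDWD), so this version +// uses vmull/vmlal long multiplies instead. It also folds the final +128 +// pixel bias into the DC coefficient up front: DC reaches every output +// pixel with a gain of 1/8 here, so adding 128*8 = 1024 to data[0] adds +// 128 to all 64 outputs.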
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = 
vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! + + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. 
if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg *j) +{ + stbi_uc x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interval? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu...
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! 
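+ // 17 bytes consumed so far for this table: one tc/th byte plus the 16 + // per-length counts just read; the n symbol values are subtracted below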
+ L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) 
return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
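+ // (FF 00 is JPEG byte stuffing: a literal 0xff data byte in the + // entropy-coded stream is always written as FF 00, so it can never be + // mistaken for a marker)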
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 
2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
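+ // combined with the vertical [3 1]/4 pass, each output pixel is a + // [9 3 3 1]/16 blend of its four nearest source samples; the rounding + // narrow in vqrshrun_n_s16(..., 4) below supplies the +8 round and /16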
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
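+ // fixed-point layout: cr/cb are unpacked into the high byte of each + // 16-bit lane (scaled by 256), so _mm_mulhi_epi16 against a 4096-scaled + // constant yields cr*256*c*4096 >> 16 = cr*c*16; y becomes y*16 + 8, so + // the sums are 12.4 fixed point with the rounding bias pre-added and the + // final >> 4 returns to 8 bits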
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
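+ // same 12.4 fixed-point scheme as the SSE2 path, but via vqdmulhq_s16 + // (doubling high-half multiply): crw is cr << 7, so 2*(cr*128)*(c*4096) + // >> 16 = cr*c*16, and yws is y << 4; vqrshrun_n_s16(..., 4) rounds and + // narrows back to 8 bits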
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make 
stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + + // resample and color-convert + { + int k; + unsigned int i,j; + stbi_uc *output; + stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this, so this is safe + output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + stbi_uc *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ?
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + unsigned char* result; + stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x,y,comp,req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context *s) +{ + int r; + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) +{ + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind( j->s ); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) +{ + int result; + stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct +{ + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { + a->zout = zout; + return 1; + } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. 
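+ // Editor's note (illustration; not part of upstream stb_image): DEFLATE
+ // matches may overlap their own output; a literal 'a' followed by the
+ // pair (len=5, dist=1) decodes to "aaaaaa". That is why the dist==1
+ // branch below is a plain byte fill, and why the general branch copies
+ // forward one byte at a time instead of calling memcpy/memmove.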
+ stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if (!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + 
ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int stbi__paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 
2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); + cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes+1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
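+ // Editor's note (illustration; not part of upstream stb_image): each
+ // filter below reconstructs a byte from its left neighbor
+ // (cur[k-filter_bytes]), the byte above (prior[k]), or both. E.g. with
+ // STBI__F_sub, a first decoded byte of 100 followed by raw bytes 10,10,10
+ // decodes to 110,120,130: every output adds the previous decoded byte,
+ // mod 256 via STBI__BYTECAST.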
+ case STBI__F_none: memcpy(cur, raw, nk); break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break; + } + #undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n+1 == out_n); + #define STBI__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break; + } + #undef STBI__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) { + cur = a->out + stride*j; // start at the beginning of the row again + for (i=0; i < x; ++i,cur+=output_bytes) { + cur[filter_bytes+1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // interfere with filtering but will still be in the cache. + if (depth < 8) { + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *in = a->out + stride*j + x*out_n - img_width_bytes; + // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarantees byte alignment; if width is not a multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
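+ // Editor's note (illustration; not part of upstream stb_image): for x=1,
+ // depth=1, img_n=1, each packed row is a single byte, but unpacking that
+ // byte naively emits 8 samples while rows sit only stride=1 byte apart in
+ // a->out, so samples 2..8 would land on top of the next row's data.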
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc *cur = a->out; + stbi__uint16 *cur16 = (stbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n) +{ + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; + + p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); + + // between here and free(out) below, exiting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; + +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_global = flag_true_if_should_convert; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ?
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
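+ // Editor's note (illustration; not part of upstream stb_image): a
+ // paletted PNG is filtered as a single channel of 8-bit palette indices,
+ // so the decoder runs with img_n = 1 here; stbi__expand_png_palette()
+ // later widens each index into its 3- or 4-byte RGB(A) palette entry.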
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 
*) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + STBI_FREE(p->out); p->out = NULL; + STBI_FREE(p->expanded); p->expanded = NULL; + STBI_FREE(p->idata); p->idata = NULL; + + return result; +} + +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi__png p; + p.s = s; + return stbi__do_png(&p, x,y,comp,req_comp, ri); +} + +static int stbi__png_test(stbi__context *s) +{ + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; +} + +static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) +{ + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind( p->s ); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; +} + +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); +} + +static int stbi__png_is16(stbi__context *s) +{ + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; +} +#endif + +// Microsoft/Windows BMP image + +#ifndef STBI_NO_BMP +static int stbi__bmp_test_raw(stbi__context *s) +{ + int r; + int sz; + if (stbi__get8(s) != 'B') return 0; + if (stbi__get8(s) != 'M') return 0; + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset + sz = stbi__get32le(s); + r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); + return r; +} + +static int stbi__bmp_test(stbi__context *s) +{ + int r = stbi__bmp_test_raw(s); + stbi__rewind(s); + return r; +} + + +// returns 0..31 for the highest set bit; -1 if z is 0 +static int stbi__high_bit(unsigned int z) +{ + int n=0; + if (z == 0) return -1; + if (z >= 0x10000) { n += 16; z >>= 16; } + if (z >= 0x00100) { n += 8; z >>= 8; } + if (z >= 0x00010) { n += 4; z >>= 4; } + if (z >= 0x00004) { n += 2; z >>= 2; } + if (z >= 0x00002) { n += 1;/* >>= 1;*/ } + return n; +} + +static int stbi__bitcount(unsigned int a) +{ + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; +} + +// extract an arbitrarily-aligned N-bit value (N=bits) +// from v, and then make it 8 bits long and fractionally +// extend it to full range.
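+// Editor's note (illustration; not part of upstream stb_image): the
+// multiply/shift tables below replicate a channel's top bits into its low
+// bits. For a 5-bit channel, v=31 maps to (31*0x21)>>2 = 1023>>2 = 255,
+// and v=16 maps to (16*0x21)>>2 = 132, close to the exact rescale
+// 16*255/31 = 131.6.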
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 
4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? 
(stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = 
stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
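+ // Editor's note (illustration; not part of upstream stb_image): the
+ // reads above walk the fixed 18-byte TGA header in order: id length,
+ // colormap type, image type, colormap spec (5 bytes), x/y origin, width,
+ // height, bits per pixel, and the image descriptor byte, whose bit 5
+ // (top-to-bottom flag) feeds tga_inverted below.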
+ // image data + unsigned char *tga_data; + unsigned char *tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO + + if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // do a tiny bit of processing + if ( tga_image_type >= 8 ) + { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // If I'm paletted, then I'll use the number of bits from the palette + if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + + if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + + // tga info + *x = tga_width; + *y = tga_height; + if (comp) *comp = tga_comp; + + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); + + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); + + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset ); + + if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) { + for (i=0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height -i - 1 : i; + stbi_uc *tga_row = tga_data + row*tga_width*tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if ( tga_indexed) + { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + + // any data to skip? (offset usually = 0) + stbi__skip(s, tga_palette_start ); + // load the palette + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); + } + if (tga_rgb16) { + stbi_uc *pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i=0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i=0; i < tga_width * tga_height; ++i) + { + // if I'm in RLE mode, do I need to get a RLE packet? + if ( tga_is_RLE ) + { + if ( RLE_count == 0 ) + { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if ( !RLE_repeating ) + { + read_next_pixel = 1; + } + } else + { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if ( read_next_pixel ) + { + // load however much data we did have + if ( tga_indexed ) + { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ?
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. 
+ if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. 
+ for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int 
act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8) return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+            {
+               int left=width, i;
+
+               while (left>0) {
+                  stbi_uc count,value[4];
+
+                  count=stbi__get8(s);
+                  if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pure read count)");
+
+                  if (count > left)
+                     count = (stbi_uc) left;
+
+                  if (!stbi__readval(s,packet->channel,value)) return 0;
+
+                  for(i=0; i<count; ++i,dest+=4)
+                     stbi__copyval(packet->channel,dest,value);
+                  left -= count;
+               }
+            }
+            break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by
Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << 
g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
+            }
+         }
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0: not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
+   }
+
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame
+
+   for (;;) {
+      int tag = stbi__get8(s);
+      switch (tag) {
+         case 0x2C: /* Image Descriptor */
+         {
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x = g->start_x + w * 4;
+            g->max_y = g->start_y + h * g->line_size;
+            g->cur_x = g->start_x;
+            g->cur_y = g->start_y;
+
+            // if the width of the specified rectangle is 0, that means
+            // we may not see *any* pixels or the image is malformed;
+            // to make sure this is caught, move the current y down to
+            // max_y (which is what out_gif_code checks).
+            if (w == 0)
+               g->cur_y = g->max_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            if (g->lflags & 0x80) {
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (!o) return NULL;
+
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
+            }
+
+            return o;
+         }
+
+         case 0x21: // Comment Extension.
+         {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
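+                  // For reference: the four GCE data bytes parsed here are a
+                  // packed flags byte (disposal method in bits 2-4, transparency
+                  // flag in bit 0 -- kept in eflags above), a little-endian
+                  // 16-bit delay in hundredths of a second, and the transparent
+                  // color index, which is handled below.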
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255;
+                  }
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0;
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1);
+                     g->transparent = -1;
+                  }
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            while ((len = stbi__get8(s)) != 0) {
+               stbi__skip(s, len);
+            }
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+}
+
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+
+   if (out) STBI_FREE(out);
+   if (delays && *delays) STBI_FREE(*delays);
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0;
+      stbi_uc *u = 0;
+      stbi_uc *out = 0;
+      stbi_uc *two_back = 0;
+      stbi__gif g;
+      int stride;
+      int out_size = 0;
+      int delays_size = 0;
+
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0; // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+
+            if (out) {
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                  out = (stbi_uc*) tmp;
+                  out_size = layers * stride;
+               }
+
+               if (delays) {
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) {
+               two_back = out - 2 * stride;
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);
+
+      // free temp buffer;
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      // do the final conversion after loading everything;
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not a GIF type.");
+   }
+}
+
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *u = 0;
+   stbi__gif g;
+   memset(&g, 0, sizeof(g));
+   STBI_NOTUSED(ri);
+
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
+   if (u == (stbi_uc *) s) u = 0; // end of animated gif marker
+   if (u) {
+      *x = g.w;
+      *y = g.h;
+
+      // moved conversion to after successful load so that the same
+      // can be done for multiple frames.
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
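+   // For reference, a minimal Radiance header accepted by this parser looks
+   // like the following (the dimensions are example values only):
+   //
+   //    #?RADIANCE
+   //    FORMAT=32-bit_rle_rgbe
+   //
+   //    -Y 480 +X 640
+   //
+   // An empty line ends the header variables; the resolution line gives the
+   // height ("-Y" = rows stored top-to-bottom) before the width.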
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scan lines
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (stbi__hdr_test(s) == 0) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token,
"FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3);
+
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Does not support ASCII image data (formats P2 and P3)
+
+#ifndef STBI_NO_PNM
+
+static int stbi__pnm_test(stbi__context *s)
+{
+   char p, t;
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
+}
+
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   STBI_NOTUSED(ri);
+
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
+      return 0;
+
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
+
+   if (req_comp && req_comp != s->img_n) {
+      if (ri->bits_per_channel == 16) {
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
+      } else {
+         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      }
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+   return out;
+}
+
+static int stbi__pnm_isspace(char c)
+{
+   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
+}
+
+static void stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   for (;;) {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+
+      if (stbi__at_eof(s) || *c != '#')
+         break;
+
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
+}
+
+static int stbi__pnm_isdigit(char c)
+{
+   return c >= '0' && c <= '9';
+}
+
+static int stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   int value = 0;
+
+   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+      value = value*10 + (*c - '0');
+      *c = (char) stbi__get8(s);
+      if((value > 214748364) || (value == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
+   }
+
+   return value;
+}
+
+static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv, dummy;
+   char c, p, t;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind(s);
+       return 0;
+   }
+
+   *comp = (t == '6') ?
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid height", "PPM image header had zero or overflowing height");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
+   else
+      return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s)
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
+	   return 1;
+   return 0;
+}
+#endif
+
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
+   #endif
+
+   // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+       return 1;
+   #endif
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+}
+
+static int stbi__is_16_main(stbi__context *s)
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
+   return 0;
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{
+    FILE *f = stbi__fopen(filename, "rb");
+    int result;
+    if (!f) return stbi__err("can't fopen", "Unable to open file");
+    result = stbi_info_from_file(f, x, y, comp);
+    fclose(f);
+    return result;
+}
+
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f);
+   stbi__start_file(&s, f);
+   r = stbi__info_main(&s,x,y,comp);
+   fseek(f,pos,SEEK_SET);
+   return r;
+}
+
+STBIDEF int stbi_is_16_bit(char const *filename)
+{
+    FILE *f = stbi__fopen(filename, "rb");
+    int result;
+    if (!f) return stbi__err("can't fopen", "Unable to open file");
+    result = stbi_is_16_bit_from_file(f);
+    fclose(f);
+    return result;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f)
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f);
+   stbi__start_file(&s, f);
+   r = stbi__is_16_main(&s);
+   fseek(f,pos,SEEK_SET);
+   return r;
+}
+#endif // !STBI_NO_STDIO
+
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__info_main(&s,x,y,comp);
+}
+
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__info_main(&s,x,y,comp);
+}
+
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const
*buffer, int len)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__is_16_main(&s);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&s);
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT
to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/stable-diffusion.cpp/examples/stb_image_write.h b/stable-diffusion.cpp/examples/stb_image_write.h new file mode 100644 index 0000000000000000000000000000000000000000..5589a7ec21acd1610ae133712a789fa5b04e81a8 --- /dev/null +++ b/stable-diffusion.cpp/examples/stb_image_write.h @@ -0,0 +1,1741 @@ +/* stb_image_write - v1.16 - public domain - http://nothings.org/stb + writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015 + no warranty implied; use at your own risk + + Before #including, + + #define STB_IMAGE_WRITE_IMPLEMENTATION + + in the file that you want to have the implementation. + + Will probably not work correctly with strict-aliasing optimizations. + +ABOUT: + + This header file is a library for writing images to C stdio or a callback. + + The PNG output is not optimal; it is 20-50% larger than the file + written by a decent optimizing implementation; though providing a custom + zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that. + This library is designed for source code compactness and simplicity, + not optimal image file size or run-time performance. + +BUILDING: + + You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h. + You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace + malloc,realloc,free. + You can #define STBIW_MEMMOVE() to replace memmove() + You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function + for PNG compression (instead of the builtin one), it must have the following signature: + unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality); + The returned data will be freed with STBIW_FREE() (free() by default), + so it must be heap allocated with STBIW_MALLOC() (malloc() by default), + +UNICODE: + + If compiling for Windows and you wish to use Unicode filenames, compile + with + #define STBIW_WINDOWS_UTF8 + and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert + Windows wchar_t filenames to utf8. 
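+
+   For example, given some wchar_t *wide_path (an illustrative sketch only;
+   buffer sizing and error handling are up to the caller):
+
+      char path_utf8[512];
+      if (stbiw_convert_wchar_to_utf8(path_utf8, sizeof(path_utf8), wide_path))
+         stbi_write_png(path_utf8, w, h, 4, pixels, w * 4);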
+ +USAGE: + + There are five functions, one for each image file format: + + int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); + int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); + int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); + int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality); + int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); + + void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically + + There are also five equivalent functions that use an arbitrary write function. You are + expected to open/close your file-equivalent before and after calling these: + + int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); + int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); + int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); + int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); + int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality); + + where the callback is: + void stbi_write_func(void *context, void *data, int size); + + You can configure it with these global variables: + int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE + int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression + int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode + + + You can define STBI_WRITE_NO_STDIO to disable the file variant of these + functions, so the library will not use stdio.h at all. However, this will + also disable HDR writing, because it requires stdio for formatted output. + + Each function returns 0 on failure and non-0 on success. + + The functions create an image file defined by the parameters. The image + is a rectangle of pixels stored from left-to-right, top-to-bottom. + Each pixel contains 'comp' channels of data stored interleaved with 8-bits + per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is + monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall. + The *data pointer points to the first byte of the top-left-most pixel. + For PNG, "stride_in_bytes" is the distance in bytes from the first byte of + a row of pixels to the first byte of the next row of pixels. + + PNG creates output files with the same number of components as the input. + The BMP format expands Y to RGB in the file format and does not + output alpha. + + PNG supports writing rectangles of data even when the bytes storing rows of + data are not consecutive in memory (e.g. sub-rectangles of a larger image), + by supplying the stride between the beginning of adjacent rows. The other + formats do not. (Thus you cannot write a native-format BMP through the BMP + writer, both because it is in BGR order and because it may have padding + at the end of the line.) + + PNG allows you to set the deflate compression level by setting the global + variable 'stbi_write_png_compression_level' (it defaults to 8). + + HDR expects linear float data. 
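+
+ As a minimal end-to-end sketch (a made-up 2x2 RGBA buffer, not upstream
+ sample code; note that this copy of the header takes an extra trailing
+ 'parameters' argument to stbi_write_png, so NULL is passed explicitly):
+
+ #define STB_IMAGE_WRITE_IMPLEMENTATION
+ #include "stb_image_write.h"
+
+ int main(void)
+ {
+ unsigned char px[2*2*4] = { 255,0,0,255, 0,255,0,255, // red, green
+ 0,0,255,255, 255,255,255,255 }; // blue, white
+ // w=2, h=2, comp=4 (RGBA), stride = w*comp bytes per row
+ return stbi_write_png("out.png", 2, 2, 4, px, 2*4, NULL) ? 0 : 1;
+ }
+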
+ Since the HDR format is always 32-bit rgb(e)
+ data, alpha (if provided) is discarded, and for monochrome data it is
+ replicated across all three channels.
+
+ TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+ data, set the global variable 'stbi_write_tga_with_rle' to 0.
+
+ JPEG ignores alpha channels in input data; quality is between 1 and 100.
+ Higher quality looks better but results in a bigger image.
+ Only baseline JPEG is written (no progressive JPEG).
+
+CREDITS:
+
+
+ Sean Barrett - PNG/BMP/TGA
+ Baldur Karlsson - HDR
+ Jean-Sebastien Guay - TGA monochrome
+ Tim Kelsey - misc enhancements
+ Alan Hickman - TGA RLE
+ Emmanuel Julien - initial file IO callback implementation
+ Jon Olick - original jo_jpeg.cpp code
+ Daniel Gibson - integrate JPEG, allow external zlib
+ Aarni Koskela - allow choosing PNG filter
+
+ bugfixes:
+ github:Chribba
+ Guillaume Chereau
+ github:jry2
+ github:romigrou
+ Sergio Gonzalez
+ Jonas Karlsson
+ Filip Wasil
+ Thatcher Ulrich
+ github:poppolopoppo
+ Patrick Boettcher
+ github:xeekworx
+ Cap Petschulat
+ Simon Rodriguez
+ Ivan Tikhonov
+ github:ignotion
+ Adam Schackart
+ Andrew Kensler
+
+LICENSE
+
+ See end of file for license information.
+
+*/
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+
+#include <stdlib.h>
+
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF static
+#else
+#ifdef __cplusplus
+#define STBIWDEF extern "C"
+#else
+#define STBIWDEF extern
+#endif
+#endif
+#endif
+
+#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes, const char* parameters = NULL);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
+
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
+
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
+
+#ifdef _WIN32
+ #ifndef _CRT_SECURE_NO_WARNINGS
+ #define _CRT_SECURE_NO_WARNINGS
+ #endif
+ #ifndef _CRT_NONSTDC_NO_DEPRECATE
+ #define _CRT_NONSTDC_NO_DEPRECATE
+ #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+
+#ifndef STBIW_MALLOC
+#define STBIW_MALLOC(sz) malloc(sz)
+#define STBIW_REALLOC(p,newsz) realloc(p,newsz)
+#define STBIW_FREE(p) free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#endif
+
+
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+
+
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+static int stbi__flip_vertically_on_write = 0;
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag)
+{
+ stbi__flip_vertically_on_write = flag;
+}
+
+typedef struct
+{
+ stbi_write_func *func;
+ void *context;
+ unsigned char buffer[64];
+ int buf_used;
+} stbi__write_context;
+
+// initialize a callback-based context
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+ s->func = c;
+ s->context = context;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size)
+{
+ fwrite(data,1,size,(FILE*) context);
+}
+
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+ return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode)
+{
+ FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+ wchar_t wMode[64];
+ wchar_t wFilename[1024];
+ if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+ return 0;
+
+ if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+ return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+ if (0 != _wfopen_s(&f, wFilename, wMode))
+ f = 0;
+#else
+ f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+ if (0 != fopen_s(&f, filename, mode))
+ f=0;
+#else
+ f = fopen(filename, mode);
+#endif
+ return f;
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename)
+{
+ FILE *f = stbiw__fopen(filename, "wb");
+ stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
+ return f != NULL;
+}
+
+static void
stbi__end_write_file(stbi__write_context *s) +{ + fclose((FILE *)s->context); +} + +#endif // !STBI_WRITE_NO_STDIO + +typedef unsigned int stbiw_uint32; +typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1]; + +static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) +{ + while (*fmt) { + switch (*fmt++) { + case ' ': break; + case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); + s->func(s->context,&x,1); + break; } + case '2': { int x = va_arg(v,int); + unsigned char b[2]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x>>8); + s->func(s->context,b,2); + break; } + case '4': { stbiw_uint32 x = va_arg(v,int); + unsigned char b[4]; + b[0]=STBIW_UCHAR(x); + b[1]=STBIW_UCHAR(x>>8); + b[2]=STBIW_UCHAR(x>>16); + b[3]=STBIW_UCHAR(x>>24); + s->func(s->context,b,4); + break; } + default: + STBIW_ASSERT(0); + return; + } + } +} + +static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) +{ + va_list v; + va_start(v, fmt); + stbiw__writefv(s, fmt, v); + va_end(v); +} + +static void stbiw__write_flush(stbi__write_context *s) +{ + if (s->buf_used) { + s->func(s->context, &s->buffer, s->buf_used); + s->buf_used = 0; + } +} + +static void stbiw__putc(stbi__write_context *s, unsigned char c) +{ + s->func(s->context, &c, 1); +} + +static void stbiw__write1(stbi__write_context *s, unsigned char a) +{ + if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) + stbiw__write_flush(s); + s->buffer[s->buf_used++] = a; +} + +static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) +{ + int n; + if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) + stbiw__write_flush(s); + n = s->buf_used; + s->buf_used = n+3; + s->buffer[n+0] = a; + s->buffer[n+1] = b; + s->buffer[n+2] = c; +} + +static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) +{ + unsigned char bg[3] = { 255, 0, 255}, px[3]; + int k; + + if (write_alpha < 0) + stbiw__write1(s, d[comp - 1]); + + switch (comp) { + case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case + case 1: + if (expand_mono) + stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp + else + stbiw__write1(s, d[0]); // monochrome TGA + break; + case 4: + if (!write_alpha) { + // composite against pink background + for (k = 0; k < 3; ++k) + px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; + stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); + break; + } + /* FALLTHROUGH */ + case 3: + stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + break; + } + if (write_alpha > 0) + stbiw__write1(s, d[comp - 1]); +} + +static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) +{ + stbiw_uint32 zero = 0; + int i,j, j_end; + + if (y <= 0) + return; + + if (stbi__flip_vertically_on_write) + vdir *= -1; + + if (vdir < 0) { + j_end = -1; j = y-1; + } else { + j_end = y; j = 0; + } + + for (; j != j_end; j += vdir) { + for (i=0; i < x; ++i) { + unsigned char *d = (unsigned char *) data + (j*x+i)*comp; + stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); + } + stbiw__write_flush(s); + s->func(s->context, &zero, scanline_pad); + } +} + +static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) 
+{ + if (y < 0 || x < 0) { + return 0; + } else { + va_list v; + va_start(v, fmt); + stbiw__writefv(s, fmt, v); + va_end(v); + stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono); + return 1; + } +} + +static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) +{ + if (comp != 4) { + // write RGB bitmap + int pad = (-x*3) & 3; + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, + "11 4 22 4" "4 44 22 444444", + 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header + 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + } else { + // RGBA bitmaps need a v4 header + // use BI_BITFIELDS mode with 32bpp and alpha mask + // (straight BI_RGB with alpha mask doesn't work in most readers) + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0, + "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444", + 'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header + 108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header + } +} + +STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_bmp_core(&s, x, y, comp, data); +} + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_bmp_core(&s, x, y, comp, data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif //!STBI_WRITE_NO_STDIO + +static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) +{ + int has_alpha = (comp == 2 || comp == 4); + int colorbytes = has_alpha ? comp-1 : comp; + int format = colorbytes < 2 ? 
3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 + + if (y < 0 || x < 0) + return 0; + + if (!stbi_write_tga_with_rle) { + return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0, + "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); + } else { + int i,j,k; + int jend, jdir; + + stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); + + if (stbi__flip_vertically_on_write) { + j = 0; + jend = y; + jdir = 1; + } else { + j = y-1; + jend = -1; + jdir = -1; + } + for (; j != jend; j += jdir) { + unsigned char *row = (unsigned char *) data + j * x * comp; + int len; + + for (i = 0; i < x; i += len) { + unsigned char *begin = row + i * comp; + int diff = 1; + len = 1; + + if (i < x - 1) { + ++len; + diff = memcmp(begin, row + (i + 1) * comp, comp); + if (diff) { + const unsigned char *prev = begin; + for (k = i + 2; k < x && len < 128; ++k) { + if (memcmp(prev, row + k * comp, comp)) { + prev += comp; + ++len; + } else { + --len; + break; + } + } + } else { + for (k = i + 2; k < x && len < 128; ++k) { + if (!memcmp(begin, row + k * comp, comp)) { + ++len; + } else { + break; + } + } + } + } + + if (diff) { + unsigned char header = STBIW_UCHAR(len - 1); + stbiw__write1(s, header); + for (k = 0; k < len; ++k) { + stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); + } + } else { + unsigned char header = STBIW_UCHAR(len - 129); + stbiw__write1(s, header); + stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); + } + } + } + stbiw__write_flush(s); + } + return 1; +} + +STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_tga_core(&s, x, y, comp, (void *) data); +} + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_tga_core(&s, x, y, comp, (void *) data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR writer +// by Baldur Karlsson + +#define stbiw__max(a, b) ((a) > (b) ? 
(a) : (b)) + +#ifndef STBI_WRITE_NO_STDIO + +static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) +{ + int exponent; + float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); + + if (maxcomp < 1e-32f) { + rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; + } else { + float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp; + + rgbe[0] = (unsigned char)(linear[0] * normalize); + rgbe[1] = (unsigned char)(linear[1] * normalize); + rgbe[2] = (unsigned char)(linear[2] * normalize); + rgbe[3] = (unsigned char)(exponent + 128); + } +} + +static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) +{ + unsigned char lengthbyte = STBIW_UCHAR(length+128); + STBIW_ASSERT(length+128 <= 255); + s->func(s->context, &lengthbyte, 1); + s->func(s->context, &databyte, 1); +} + +static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) +{ + unsigned char lengthbyte = STBIW_UCHAR(length); + STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code + s->func(s->context, &lengthbyte, 1); + s->func(s->context, data, length); +} + +static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline) +{ + unsigned char scanlineheader[4] = { 2, 2, 0, 0 }; + unsigned char rgbe[4]; + float linear[3]; + int x; + + scanlineheader[2] = (width&0xff00)>>8; + scanlineheader[3] = (width&0x00ff); + + /* skip RLE for images too small or large */ + if (width < 8 || width >= 32768) { + for (x=0; x < width; x++) { + switch (ncomp) { + case 4: /* fallthrough */ + case 3: linear[2] = scanline[x*ncomp + 2]; + linear[1] = scanline[x*ncomp + 1]; + linear[0] = scanline[x*ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; + break; + } + stbiw__linear_to_rgbe(rgbe, linear); + s->func(s->context, rgbe, 4); + } + } else { + int c,r; + /* encode into scratch buffer */ + for (x=0; x < width; x++) { + switch(ncomp) { + case 4: /* fallthrough */ + case 3: linear[2] = scanline[x*ncomp + 2]; + linear[1] = scanline[x*ncomp + 1]; + linear[0] = scanline[x*ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; + break; + } + stbiw__linear_to_rgbe(rgbe, linear); + scratch[x + width*0] = rgbe[0]; + scratch[x + width*1] = rgbe[1]; + scratch[x + width*2] = rgbe[2]; + scratch[x + width*3] = rgbe[3]; + } + + s->func(s->context, scanlineheader, 4); + + /* RLE each component separately */ + for (c=0; c < 4; c++) { + unsigned char *comp = &scratch[width*c]; + + x = 0; + while (x < width) { + // find first run + r = x; + while (r+2 < width) { + if (comp[r] == comp[r+1] && comp[r] == comp[r+2]) + break; + ++r; + } + if (r+2 >= width) + r = width; + // dump up to first run + while (x < r) { + int len = r-x; + if (len > 128) len = 128; + stbiw__write_dump_data(s, len, &comp[x]); + x += len; + } + // if there's a run, output it + if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd + // find next byte after run + while (r < width && comp[r] == comp[x]) + ++r; + // output run up to r + while (x < r) { + int len = r-x; + if (len > 127) len = 127; + stbiw__write_run_data(s, len, comp[x]); + x += len; + } + } + } + } + } +} + +static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) +{ + if (y <= 0 || x <= 0 || data == NULL) + return 0; + else { + // Each component is stored separately. 
Allocate scratch space for full output scanline. + unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4); + int i, len; + char buffer[128]; + char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; + s->func(s->context, header, sizeof(header)-1); + +#ifdef __STDC_LIB_EXT1__ + len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); +#else + len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); +#endif + s->func(s->context, buffer, len); + + for(i=0; i < y; i++) + stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i)); + STBIW_FREE(scratch); + return 1; + } +} + +STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_hdr_core(&s, x, y, comp, (float *) data); +} + +STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif // STBI_WRITE_NO_STDIO + + +////////////////////////////////////////////////////////////////////////////// +// +// PNG writer +// + +#ifndef STBIW_ZLIB_COMPRESS +// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() +#define stbiw__sbraw(a) ((int *) (void *) (a) - 2) +#define stbiw__sbm(a) stbiw__sbraw(a)[0] +#define stbiw__sbn(a) stbiw__sbraw(a)[1] + +#define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a)) +#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0) +#define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a))) + +#define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v)) +#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) +#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0) + +static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) +{ + int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1; + void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? 
(stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2); + STBIW_ASSERT(p); + if (p) { + if (!*arr) ((int *) p)[1] = 0; + *arr = (void *) ((int *) p + 2); + stbiw__sbm(*arr) = m; + } + return *arr; +} + +static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) +{ + while (*bitcount >= 8) { + stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); + *bitbuffer >>= 8; + *bitcount -= 8; + } + return data; +} + +static int stbiw__zlib_bitrev(int code, int codebits) +{ + int res=0; + while (codebits--) { + res = (res << 1) | (code & 1); + code >>= 1; + } + return res; +} + +static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit) +{ + int i; + for (i=0; i < limit && i < 258; ++i) + if (a[i] != b[i]) break; + return i; +} + +static unsigned int stbiw__zhash(unsigned char *data) +{ + stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; +} + +#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) +#define stbiw__zlib_add(code,codebits) \ + (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) +#define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c) +// default huffman tables +#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8) +#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) +#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7) +#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8) +#define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n)) +#define stbiw__zlib_huffb(n) ((n) <= 143 ? 
stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) + +#define stbiw__ZHASH 16384 + +#endif // STBIW_ZLIB_COMPRESS + +STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) +{ +#ifdef STBIW_ZLIB_COMPRESS + // user provided a zlib compress implementation, use that + return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality); +#else // use builtin + static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 }; + static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; + static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 }; + static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + unsigned int bitbuf=0; + int i,j, bitcount=0; + unsigned char *out = NULL; + unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**)); + if (hash_table == NULL) + return NULL; + if (quality < 5) quality = 5; + + stbiw__sbpush(out, 0x78); // DEFLATE 32K window + stbiw__sbpush(out, 0x5e); // FLEVEL = 1 + stbiw__zlib_add(1,1); // BFINAL = 1 + stbiw__zlib_add(1,2); // BTYPE = 1 -- fixed huffman + + for (i=0; i < stbiw__ZHASH; ++i) + hash_table[i] = NULL; + + i=0; + while (i < data_len-3) { + // hash next 3 bytes of data to be compressed + int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3; + unsigned char *bestloc = 0; + unsigned char **hlist = hash_table[h]; + int n = stbiw__sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32768) { // if entry lies within window + int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i); + if (d >= best) { best=d; bestloc=hlist[j]; } + } + } + // when hash table entry is too long, delete half the entries + if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) { + STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality); + stbiw__sbn(hash_table[h]) = quality; + } + stbiw__sbpush(hash_table[h],data+i); + + if (bestloc) { + // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal + h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1); + hlist = hash_table[h]; + n = stbiw__sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32767) { + int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1); + if (e > best) { // if next match is better, bail on current match + bestloc = NULL; + break; + } + } + } + } + + if (bestloc) { + int d = (int) (data+i - bestloc); // distance back + STBIW_ASSERT(d <= 32767 && best <= 258); + for (j=0; best > lengthc[j+1]-1; ++j); + stbiw__zlib_huff(j+257); + if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]); + for (j=0; d > distc[j+1]-1; ++j); + stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5); + if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]); + i += best; + } else { + stbiw__zlib_huffb(data[i]); + ++i; + } + } + // write out final bytes + for (;i < data_len; ++i) + stbiw__zlib_huffb(data[i]); + stbiw__zlib_huff(256); // end of block + // pad with 0 bits to byte boundary + while (bitcount) + stbiw__zlib_add(0,1); + + for (i=0; i < stbiw__ZHASH; ++i) + (void) stbiw__sbfree(hash_table[i]); + STBIW_FREE(hash_table); + + // store uncompressed instead if compression was worse + if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) { + stbiw__sbn(out) = 2; // truncate to DEFLATE 
32K window and FLEVEL = 1 + for (j = 0; j < data_len;) { + int blocklen = data_len - j; + if (blocklen > 32767) blocklen = 32767; + stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression + stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN + stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN + stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8)); + memcpy(out+stbiw__sbn(out), data+j, blocklen); + stbiw__sbn(out) += blocklen; + j += blocklen; + } + } + + { + // compute adler32 on input + unsigned int s1=1, s2=0; + int blocklen = (int) (data_len % 5552); + j=0; + while (j < data_len) { + for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; } + s1 %= 65521; s2 %= 65521; + j += blocklen; + blocklen = 5552; + } + stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(s2)); + stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(s1)); + } + *out_len = stbiw__sbn(out); + // make returned pointer freeable + STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); + return (unsigned char *) stbiw__sbraw(out); +#endif // STBIW_ZLIB_COMPRESS +} + +static unsigned int stbiw__crc32(unsigned char *buffer, int len) +{ +#ifdef STBIW_CRC32 + return STBIW_CRC32(buffer, len); +#else + static unsigned int crc_table[256] = + { + 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, + 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, + 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, + 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, + 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, + 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, + 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, + 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, + 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, + 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, + 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, + 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, + 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, + 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, + 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, + 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, + 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, + 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, + 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, + 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, + 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, + 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, + 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 
0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, + 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, + 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, + 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, + 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, + 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, + 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, + 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, + 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, + 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D + }; + + unsigned int crc = ~0u; + int i; + for (i=0; i < len; ++i) + crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; + return ~crc; +#endif +} + +#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4) +#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v)); +#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3]) + +static void stbiw__wpcrc(unsigned char **data, int len) +{ + unsigned int crc = stbiw__crc32(*data - len - 4, len+4); + stbiw__wp32(*data, crc); +} + +static unsigned char stbiw__paeth(int a, int b, int c) +{ + int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c); + if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); + if (pb <= pc) return STBIW_UCHAR(b); + return STBIW_UCHAR(c); +} + +// @OPTIMIZE: provide an option that always forces left-predict or paeth predict +static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer) +{ + static int mapping[] = { 0,1,2,3,4 }; + static int firstmap[] = { 0,1,0,5,6 }; + int *mymap = (y != 0) ? mapping : firstmap; + int i; + int type = mymap[filter_type]; + unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); + int signed_stride = stbi__flip_vertically_on_write ? 
-stride_bytes : stride_bytes; + + if (type==0) { + memcpy(line_buffer, z, width*n); + return; + } + + // first loop isn't optimized since it's just one pixel + for (i = 0; i < n; ++i) { + switch (type) { + case 1: line_buffer[i] = z[i]; break; + case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break; + case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break; + case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break; + case 5: line_buffer[i] = z[i]; break; + case 6: line_buffer[i] = z[i]; break; + } + } + switch (type) { + case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break; + case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break; + case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break; + case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break; + case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break; + case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break; + } +} + +STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len, const char* parameters) +{ + int force_filter = stbi_write_force_png_filter; + int param_length = 0; + int ctype[5] = { -1, 0, 4, 2, 6 }; + unsigned char sig[8] = { 137,80,78,71,13,10,26,10 }; + unsigned char *out,*o, *filt, *zlib; + signed char *line_buffer; + int j,zlen; + + if (stride_bytes == 0) + stride_bytes = x * n; + + if (force_filter >= 5) { + force_filter = -1; + } + + filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0; + line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; } + for (j=0; j < y; ++j) { + int filter_type; + if (force_filter > -1) { + filter_type = force_filter; + stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer); + } else { // Estimate the best filter by running through all of them: + int best_filter = 0, best_filter_val = 0x7fffffff, est, i; + for (filter_type = 0; filter_type < 5; filter_type++) { + stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer); + + // Estimate the entropy of the line using this filter; the less, the better. 
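+ // (This is the minimum-sum-of-absolute-differences heuristic the PNG
+ // spec recommends: each filtered byte is read as signed, and a smaller
+ // total tends to deflate better than a larger one.)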
+ est = 0;
+ for (i = 0; i < x*n; ++i) {
+ est += abs((signed char) line_buffer[i]);
+ }
+ if (est < best_filter_val) {
+ best_filter_val = est;
+ best_filter = filter_type;
+ }
+ }
+ if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it
+ stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+ filter_type = best_filter;
+ }
+ }
+ // when we get here, filter_type contains the filter type, and line_buffer contains the data
+ filt[j*(x*n+1)] = (unsigned char) filter_type;
+ STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
+ }
+ STBIW_FREE(line_buffer);
+ zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
+ STBIW_FREE(filt);
+ if (!zlib) return 0;
+
+ if(parameters != NULL) {
+ param_length = strlen(parameters);
+ param_length += strlen("parameters") + 1; // For the name and the null-byte
+ }
+
+ // each tag requires 12 bytes of overhead
+ out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0));
+ if (!out) return 0;
+ *out_len = 8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0);
+
+ o=out;
+ STBIW_MEMMOVE(o,sig,8); o+= 8;
+ stbiw__wp32(o, 13); // header length
+ stbiw__wptag(o, "IHDR");
+ stbiw__wp32(o, x);
+ stbiw__wp32(o, y);
+ *o++ = 8;
+ *o++ = STBIW_UCHAR(ctype[n]);
+ *o++ = 0;
+ *o++ = 0;
+ *o++ = 0;
+ stbiw__wpcrc(&o,13);
+
+ if(parameters != NULL) {
+ stbiw__wp32(o, param_length);
+ stbiw__wptag(o, "tEXt");
+ STBIW_MEMMOVE(o, "parameters", strlen("parameters"));
+ o+=strlen("parameters");
+ *o++ = 0; // Null byte separator
+ STBIW_MEMMOVE(o, parameters, strlen(parameters));
+ o+=strlen(parameters);
+ stbiw__wpcrc(&o, param_length);
+ }
+
+ stbiw__wp32(o, zlen);
+ stbiw__wptag(o, "IDAT");
+ STBIW_MEMMOVE(o, zlib, zlen);
+ o += zlen;
+ STBIW_FREE(zlib);
+ stbiw__wpcrc(&o, zlen);
+
+ stbiw__wp32(o,0);
+ stbiw__wptag(o, "IEND");
+ stbiw__wpcrc(&o,0);
+
+ STBIW_ASSERT(o == out + *out_len);
+
+ return out;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes, const char* parameters)
+{
+ FILE *f;
+ int len;
+ unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, parameters);
+ if (png == NULL) return 0;
+
+ f = stbiw__fopen(filename, "wb");
+ if (!f) { STBIW_FREE(png); return 0; }
+ fwrite(png, 1, len, f);
+ fclose(f);
+ STBIW_FREE(png);
+ return 1;
+}
+#endif
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{
+ int len;
+ unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, NULL);
+ if (png == NULL) return 0;
+ func(context, png, len);
+ STBIW_FREE(png);
+ return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+ 24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
+
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+ int bitBuf = *bitBufP, bitCnt = *bitCntP;
+ bitCnt += bs[1];
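+ // bs is a {code, bit-count} pair: codes accumulate MSB-first in a 24-bit
+ // window, whole bytes are flushed out below, and every flushed 0xFF is
+ // followed by a 0x00 stuff byte, as JPEG entropy-coded data requires.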
+ bitBuf |= bs[0] << (24 - bitCnt);
+ while(bitCnt >= 8) {
+ unsigned char c = (bitBuf >> 16) & 255;
+ stbiw__putc(s, c);
+ if(c == 255) {
+ stbiw__putc(s, 0);
+ }
+ bitBuf <<= 8;
+ bitCnt -= 8;
+ }
+ *bitBufP = bitBuf;
+ *bitCntP = bitCnt;
+}
+
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+ float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+ float z1, z2, z3, z4, z5, z11, z13;
+
+ float tmp0 = d0 + d7;
+ float tmp7 = d0 - d7;
+ float tmp1 = d1 + d6;
+ float tmp6 = d1 - d6;
+ float tmp2 = d2 + d5;
+ float tmp5 = d2 - d5;
+ float tmp3 = d3 + d4;
+ float tmp4 = d3 - d4;
+
+ // Even part
+ float tmp10 = tmp0 + tmp3; // phase 2
+ float tmp13 = tmp0 - tmp3;
+ float tmp11 = tmp1 + tmp2;
+ float tmp12 = tmp1 - tmp2;
+
+ d0 = tmp10 + tmp11; // phase 3
+ d4 = tmp10 - tmp11;
+
+ z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+ d2 = tmp13 + z1; // phase 5
+ d6 = tmp13 - z1;
+
+ // Odd part
+ tmp10 = tmp4 + tmp5; // phase 2
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ // The rotator is modified from fig 4-8 to avoid extra negations.
+ z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+ z2 = tmp10 * 0.541196100f + z5; // c2-c6
+ z4 = tmp12 * 1.306562965f + z5; // c2+c6
+ z3 = tmp11 * 0.707106781f; // c4
+
+ z11 = tmp7 + z3; // phase 5
+ z13 = tmp7 - z3;
+
+ *d5p = z13 + z2; // phase 6
+ *d3p = z13 - z2;
+ *d1p = z11 + z4;
+ *d7p = z11 - z4;
+
+ *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6;
+}
+
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+ int tmp1 = val < 0 ? -val : val;
+ val = val < 0 ? val-1 : val;
+ bits[1] = 1;
+ while(tmp1 >>= 1) {
+ ++bits[1];
+ }
+ bits[0] = val & ((1<<bits[1])-1);
+}
+
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+ const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
+ const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
+ int dataOff, i, j, n, diff, end0pos, x, y;
+ int DU[64];
+
+ // DCT rows
+ for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+ stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+ }
+ // DCT columns
+ for(dataOff=0; dataOff<8; ++dataOff) {
+ stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4], &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+ }
+ // Quantize/descale/zigzag the coefficients
+ for(y = 0, j=0; y < 8; ++y) {
+ for(x = 0; x < 8; ++x,++j) {
+ float v;
+ i = y*du_stride+x;
+ v = CDU[i]*fdtbl[j];
+ DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+ }
+ }
+
+ // Encode DC
+ diff = DU[0] - DC;
+ if (diff == 0) {
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+ } else {
+ unsigned short bits[2];
+ stbiw__jpg_calcBits(diff, bits);
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+ }
+ // Encode ACs
+ end0pos = 63;
+ for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+ }
+ // end0pos = first element in reverse order !=0
+ if(end0pos == 0) {
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+ return DU[0];
+ }
+ for(i = 1; i <= end0pos; ++i) {
+ int startpos = i;
+ int nrzeroes;
+ unsigned short bits[2];
+ for (; DU[i]==0 && i<=end0pos; ++i) {
+ }
+ nrzeroes = i-startpos;
+ if ( nrzeroes >= 16 ) {
+ int lng = nrzeroes>>4;
+ int nrmarker;
+ for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+ nrzeroes &= 15;
+ }
+ stbiw__jpg_calcBits(DU[i], bits);
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+ }
+ if(end0pos != 63) {
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+ }
+ return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+ // Constants that don't pollute global namespace
+ static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+ static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+ static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+ static const unsigned char std_ac_luminance_values[] = {
+ 0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+ 0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+ 0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+ 0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+ 
0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6, + 0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2, + 0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa + }; + static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0}; + static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11}; + static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77}; + static const unsigned char std_ac_chrominance_values[] = { + 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91, + 0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26, + 0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58, + 0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87, + 0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4, + 0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda, + 0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa + }; + // Huffman tables + static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}}; + static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}}; + static const unsigned short YAC_HT[256][2] = { + {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + 
{1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0}, + {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} + }; + static const unsigned short UVAC_HT[256][2] = { + {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0}, + {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} + }; + static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22, + 37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99}; + static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99, + 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99}; + static 
const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, + 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; + + int row, col, i, k, subsample; + float fdtbl_Y[64], fdtbl_UV[64]; + unsigned char YTable[64], UVTable[64]; + + if(!data || !width || !height || comp > 4 || comp < 1) { + return 0; + } + + quality = quality ? quality : 90; + subsample = quality <= 90 ? 1 : 0; + quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; + quality = quality < 50 ? 5000 / quality : 200 - quality * 2; + + for(i = 0; i < 64; ++i) { + int uvti, yti = (YQT[i]*quality+50)/100; + YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti); + uvti = (UVQT[i]*quality+50)/100; + UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + } + + for(row = 0, k = 0; row < 8; ++row) { + for(col = 0; col < 8; ++col, ++k) { + fdtbl_Y[k] = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); + fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); + } + } + + // Write Headers + { + static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 }; + static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 }; + const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width), + 3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; + s->func(s->context, (void*)head0, sizeof(head0)); + s->func(s->context, (void*)YTable, sizeof(YTable)); + stbiw__putc(s, 1); + s->func(s->context, UVTable, sizeof(UVTable)); + s->func(s->context, (void*)head1, sizeof(head1)); + s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1); + s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values)); + stbiw__putc(s, 0x10); // HTYACinfo + s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1); + s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values)); + stbiw__putc(s, 1); // HTUDCinfo + s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1); + s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); + stbiw__putc(s, 0x11); // HTUACinfo + s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1); + s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); + s->func(s->context, (void*)head2, sizeof(head2)); + } + + // Encode 8x8 macroblocks + { + static const unsigned short fillBits[] = {0x7F, 7}; + int DCY=0, DCU=0, DCV=0; + int bitBuf=0, bitCnt=0; + // comp == 2 is grey+alpha (alpha is ignored) + int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; + const unsigned char *dataR = (const unsigned char *)data; + const unsigned char *dataG = dataR + ofsG; + const unsigned char *dataB = dataR + ofsB; + int x, y, pos; + if(subsample) { + for(y = 0; y < height; y += 16) { + for(x = 0; x < width; x += 16) { + float Y[256], U[256], V[256]; + for(row = y, pos = 0; row < y+16; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? 
(height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+16; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + + // subsample U,V + { + float subU[64], subV[64]; + int yy, xx; + for(yy = 0, pos = 0; yy < 8; ++yy) { + for(xx = 0; xx < 8; ++xx, ++pos) { + int j = yy*32+xx*2; + subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f; + subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f; + } + } + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + } + } + } + } else { + for(y = 0; y < height; y += 8) { + for(x = 0; x < width; x += 8) { + float Y[64], U[64], V[64]; + for(row = y, pos = 0; row < y+8; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+8; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } + + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + } + } + } + + // Do the bit alignment of the EOI marker + stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits); + } + + // EOI + stbiw__putc(s, 0xFF); + stbiw__putc(s, 0xD9); + + return 1; +} + +STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality); +} + + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif + +#endif // STB_IMAGE_WRITE_IMPLEMENTATION + +/* Revision history + 1.16 (2021-07-11) + make Deflate code emit uncompressed blocks when it would otherwise expand + support writing BMPs with alpha channel + 1.15 (2020-07-13) unknown + 1.14 (2020-02-02) updated JPEG writer to downsample chroma channels + 1.13 + 1.12 + 1.11 (2019-08-11) + + 1.10 (2019-02-07) + support utf8 filenames in Windows; fix warnings and platform ifdefs + 1.09 (2018-02-11) + 
fix typo in zlib quality API, improve STB_I_W_STATIC in C++ + 1.08 (2018-01-29) + add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter + 1.07 (2017-07-24) + doc fix + 1.06 (2017-07-23) + writing JPEG (using Jon Olick's code) + 1.05 ??? + 1.04 (2017-03-03) + monochrome BMP expansion + 1.03 ??? + 1.02 (2016-04-02) + avoid allocating large structures on the stack + 1.01 (2016-01-16) + STBIW_REALLOC_SIZED: support allocators with no realloc support + avoid race-condition in crc initialization + minor compile issues + 1.00 (2015-09-14) + installable file IO function + 0.99 (2015-09-13) + warning fixes; TGA rle support + 0.98 (2015-04-08) + added STBIW_MALLOC, STBIW_ASSERT etc + 0.97 (2015-01-18) + fixed HDR asserts, rewrote HDR rle logic + 0.96 (2015-01-17) + add HDR output + fix monochrome BMP + 0.95 (2014-08-17) + add monochrome TGA output + 0.94 (2014-05-31) + rename private functions to avoid conflicts with stb_image.h + 0.93 (2014-05-27) + warning fixes + 0.92 (2010-08-01) + casts to unsigned char to fix warnings + 0.91 (2010-07-17) + first public release + 0.90 first internal release +*/ + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/stable-diffusion.cpp/ggml/.editorconfig b/stable-diffusion.cpp/ggml/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..135a7e4bce5a168e13818ac47da04a693ce8ff8d --- /dev/null +++ b/stable-diffusion.cpp/ggml/.editorconfig @@ -0,0 +1,19 @@ +# https://EditorConfig.org + +# Top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file, utf-8 charset +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 +indent_style = space +indent_size = 4 + +[Makefile] +indent_style = tab + +[prompts/*.txt] +insert_final_newline = unset diff --git a/stable-diffusion.cpp/ggml/.github/workflows/ci.yml b/stable-diffusion.cpp/ggml/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..4e5c633af55332fad0135edf5d86243d1cc66e00 --- /dev/null +++ b/stable-diffusion.cpp/ggml/.github/workflows/ci.yml @@ -0,0 +1,137 @@ +name: CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test-ubuntu-opencl: + runs-on: ubuntu-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + GGML_N_THREADS: 2 + + steps: + - uses: actions/checkout@v3 + + - name: Dependencies + run: | + wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list + sudo apt-get update + sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev + - name: Create Build Environment + run: mkdir build + + - name: Configure CMake + working-directory: ./build + run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_CLBLAST=ON .. + + - name: Build + working-directory: ./build + run: make + + - name: Test + working-directory: ./build + run: ctest --verbose --timeout 900 + + - name: Test Coverage + working-directory: ./build + run: | + llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata + llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata + llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata + test-macos-metal: + runs-on: macos-13 + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + GGML_N_THREADS: 2 + + steps: + - uses: actions/checkout@v3 + + - name: Create Build Environment + run: mkdir build + + - name: Configure CMake + working-directory: ./build + run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=ON .. 
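# (editor's note) -DGGML_METAL=ON above builds the Metal backend on the
# macos-13 runner; the coverage steps below reach llvm-profdata/llvm-cov
# through `xcrun`, which is how those tools are found on a stock macOS toolchain.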
+ + - name: Build + working-directory: ./build + run: make + + - name: Test + working-directory: ./build + run: ctest --verbose --timeout 900 + + - name: Test Coverage + working-directory: ./build + run: | + xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata + xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata + xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata + + build: + + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + + runs-on: ${{ matrix.os }} + + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + + steps: + - uses: actions/checkout@v3 + + - name: Dependencies for Ubuntu + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install llvm + + - name: Set GGML_N_THREADS for Ubuntu + run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV + if: matrix.os == 'ubuntu-latest' + + - name: Set GGML_N_THREADS for MacOS + run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV + if: matrix.os == 'macos-latest' + + - name: Create Build Environment + run: mkdir build + + - name: Configure CMake + working-directory: ./build + run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON .. + + - name: Build + working-directory: ./build + run: make + + - name: Test + working-directory: ./build + run: ctest --verbose --timeout 900 + + - name: Test Coverage for Ubuntu + if: matrix.os == 'ubuntu-latest' + working-directory: ./build + run: | + llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata + llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata + llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata + + - name: Test Coverage for MacOS + if: matrix.os == 'macos-latest' + working-directory: ./build + run: | + xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata + xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata + xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata diff --git a/stable-diffusion.cpp/ggml/.gitignore b/stable-diffusion.cpp/ggml/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..35c37674d9cd6c54634da202220eab5c3de2ac0f --- /dev/null +++ b/stable-diffusion.cpp/ggml/.gitignore @@ -0,0 +1,37 @@ +build/ +build-debug/ +build-release/ +build-sanitize-addr/ +build-sanitize-thread/ +build-cov/ +build-ci-debug/ +build-ci-release/ +build-cublas/ +out/ +tmp/ +models/ +models-mnt + +compile_commands.json +CMakeSettings.json +.vs/ +.vscode/ +.clangd + +.exrc +.cache +.DS_Store +.stablelm +.gpt-2 + +src/arm_neon.h +tests/arm_neon.h + +zig-out/ +zig-cache/ + +*.dot + +*.sw? 
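# (editor's note) *.sw? above matches Vim swap files; __pycache__/ below
# keeps Python bytecode caches out of the tree.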
+
+__pycache__/
diff --git a/stable-diffusion.cpp/ggml/CMakeLists.txt b/stable-diffusion.cpp/ggml/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db1179d035bdb36c33668f3906cc94be262872f0
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/CMakeLists.txt
@@ -0,0 +1,197 @@
+cmake_minimum_required (VERSION 3.3)
+project(ggml VERSION 0.1.0)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(GGML_STANDALONE ON)
+    include(cmake/GitVars.cmake)
+    include(cmake/BuildTypes.cmake)
+else()
+    set(GGML_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+# options
+
+option(BUILD_SHARED_LIBS "ggml: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
+
+option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
+option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+
+option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
+option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
+option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
+option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+option(GGML_TEST_COVERAGE "ggml: enable test coverage" OFF)
+
+option(GGML_PERF "ggml: enable perf timings" OFF)
+option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
+option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
+option(GGML_CLBLAST "ggml: use clBLAST" OFF)
+option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
+option(GGML_METAL "ggml: use Metal" OFF)
+
+# sanitizers
+
+if (GGML_SANITIZE_THREAD)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+endif()
+
+if (GGML_SANITIZE_ADDRESS)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+endif()
+
+if (GGML_SANITIZE_UNDEFINED)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+endif()
+
+# instruction set specific
+option(GGML_AVX "ggml: enable AVX" ON)
+option(GGML_AVX2 "ggml: enable AVX2" ON)
+option(GGML_AVX512 "ggml: enable AVX512" OFF)
+option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
+option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
+option(GGML_FMA "ggml: enable FMA" ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(GGML_F16C "ggml: enable F16C" ON)
+endif()
+
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
+
+# warning flags
+
+if (GGML_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(c_flags -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes)
+        set(cxx_flags -Wall -Wpedantic -Wformat=2)
+    else()
+        # todo : windows
+    endif()
+
+    add_compile_options(
+        "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+    )
+endif()
+
+if (NOT MSVC)
+    add_compile_options(
+        "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
+        "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
+    )
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3
as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) +add_compile_definitions(_XOPEN_SOURCE=600) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + remove_definitions(-D_XOPEN_SOURCE=600) + add_compile_definitions(_XOPEN_SOURCE=700) +endif() + +# Data types, macros and functions related to controlling CPU affinity +# are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "DragonFly") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WHISPER_PERF) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) +endif() + +# dependencies + +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 11) + +find_package(Threads REQUIRED) + +# main + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo") +endif () + +if (GGML_BUILD_TESTS) + if (GGML_TEST_COVERAGE) + if (CMAKE_C_COMPILER_ID MATCHES "Clang") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + else() + message(WARNING "Test coverage is only supported for Clang") + endif() + endif() +endif() + +add_subdirectory(src) + +if (GGML_BUILD_TESTS) + enable_testing() + add_subdirectory(tests) +endif () + +if (GGML_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + @ONLY) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + DESTINATION share/pkgconfig) diff --git a/stable-diffusion.cpp/ggml/LICENSE b/stable-diffusion.cpp/ggml/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..fb7ff0ce4a7626a4e76edffcb7b20fbc59ba9b19 --- /dev/null +++ b/stable-diffusion.cpp/ggml/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included 
in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/stable-diffusion.cpp/ggml/README.md b/stable-diffusion.cpp/ggml/README.md new file mode 100644 index 0000000000000000000000000000000000000000..982f12bf5e6761e42718f35b093dcd8a3662ded7 --- /dev/null +++ b/stable-diffusion.cpp/ggml/README.md @@ -0,0 +1,140 @@ +# ggml + +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) + +Tensor library for machine learning + +***Note that this project is under active development. \ +Some of the development is currently happening in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repos*** + +## Features + +- Written in C +- 16-bit float support +- Integer quantization support (4-bit, 5-bit, 8-bit, etc.) +- Automatic differentiation +- ADAM and L-BFGS optimizers +- Optimized for Apple Silicon +- On x86 architectures utilizes AVX / AVX2 intrinsics +- On ppc64 architectures utilizes VSX intrinsics +- No third-party dependencies +- Zero memory allocations during runtime + +## Updates + +- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2) +- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j) +- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper) +- [X] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27 +- [X] Example of Cerebras-GPT inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2) +- [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12 +- [X] Example of LLaMA inference [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) +- [X] Example of LLaMA training [ggerganov/llama.cpp/examples/baby-llama](https://github.com/ggerganov/llama.cpp/tree/master/examples/baby-llama) +- [X] Example of Falcon inference [cmp-nct/ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp) +- [X] Example of BLOOM inference [NouamaneTazi/bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp) +- [X] Example of RWKV inference [saharNooby/rwkv.cpp](https://github.com/saharNooby/rwkv.cpp) +- [X] Example of SAM inference [examples/sam](https://github.com/ggerganov/ggml/tree/master/examples/sam) +- [X] Idea for GPU support: https://github.com/ggerganov/llama.cpp/discussions/915 +- [X] Example of StableLM (GPT-NeoX) inference [examples/gpt-neox](https://github.com/ggerganov/ggml/tree/master/examples/gpt-neox) +- [X] Example of BERT inference [skeskinen/bert.cpp](https://github.com/skeskinen/bert.cpp) +- [X] Example of 💫 StarCoder inference [examples/starcoder](https://github.com/ggerganov/ggml/tree/master/examples/starcoder) +- [X] Example of MPT inference [examples/mpt](https://github.com/ggerganov/ggml/tree/master/examples/mpt) +- [X] Example of Replit inference [examples/replit](https://github.com/ggerganov/ggml/tree/master/examples/replit) +- [X] Example 
of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
+- [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
+- [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
+- [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)
+- [X] Example of ChatGLM inference [li-plus/chatglm.cpp](https://github.com/li-plus/chatglm.cpp)
+- [X] Example of Stable Diffusion inference [leejet/stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)
+- [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
+
+## Whisper inference (example)
+
+With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
+
+Memory requirements:
+
+| Model  | Disk   | Mem     |
+| ---    | ---    | ---     |
+| tiny   | 75 MB  | ~280 MB |
+| base   | 142 MB | ~430 MB |
+| small  | 466 MB | ~1.0 GB |
+| medium | 1.5 GB | ~2.6 GB |
+| large  | 2.9 GB | ~4.7 GB |
+
+## GPT inference (example)
+
+With ggml you can efficiently run [GPT-2](examples/gpt-2) and [GPT-J](examples/gpt-j) inference on the CPU.
+
+Here is how to run the example programs:
+
+```bash
+# Build ggml + examples
+git clone https://github.com/ggerganov/ggml
+cd ggml
+mkdir build && cd build
+cmake ..
+make -j4 gpt-2 gpt-j
+
+# Run the GPT-2 small 117M model
+../examples/gpt-2/download-ggml-model.sh 117M
+./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
+
+# Run the GPT-J 6B model (requires 12GB disk space and 16GB CPU RAM)
+../examples/gpt-j/download-ggml-model.sh 6B
+./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
+
+# Install Python dependencies
+python3 -m pip install -r ../requirements.txt
+
+# Run the Cerebras-GPT 111M model
+# Download from: https://huggingface.co/cerebras
+python3 ../examples/gpt-2/convert-cerebras-to-ggml.py /path/to/Cerebras-GPT-111M/
+./bin/gpt-2 -m /path/to/Cerebras-GPT-111M/ggml-model-f16.bin -p "This is an example"
+```
+
+The inference speeds that I get for the different models on my 32GB MacBook M1 Pro are as follows:
+
+| Model | Size  | Time / Token |
+| ---   | ---   | ---          |
+| GPT-2 | 117M  | 5 ms         |
+| GPT-2 | 345M  | 12 ms        |
+| GPT-2 | 774M  | 23 ms        |
+| GPT-2 | 1558M | 42 ms        |
+| ---   | ---   | ---          |
+| GPT-J | 6B    | 125 ms       |
+
+For more information, check out the corresponding programs in the [examples](examples) folder.
+
+## Using Metal (only with GPT-2)
+
+For GPT-2 models, offloading to GPU is possible. Note that it will not improve inference performance but will reduce power consumption and free up the CPU for other tasks.
+
+To enable GPU offloading on macOS:
+
+```bash
+cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
+
+# enable GPU offloading by passing -ngl
+./bin/gpt-2 -t 4 -ngl 100 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
+```
+
+## Using cuBLAS
+
+```bash
+# fix the path to point to your CUDA compiler
+cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
+```
+
+## Using clBLAST
+
+```bash
+cmake -DGGML_CLBLAST=ON ..
+```
+
+## Resources
+
+- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
+- [marella/ctransformers](https://github.com/marella/ctransformers): Python bindings for GGML models.
+- [go-skynet/go-ggml-transformers.cpp](https://github.com/go-skynet/go-ggml-transformers.cpp): Golang bindings for GGML models +- [smspillaz/ggml-gobject](https://github.com/smspillaz/ggml-gobject): GObject-introspectable wrapper for use of GGML on the GNOME platform. diff --git a/stable-diffusion.cpp/ggml/build.zig b/stable-diffusion.cpp/ggml/build.zig new file mode 100644 index 0000000000000000000000000000000000000000..5aa379dc1558ef5fa8a0dc6865ef47b955961428 --- /dev/null +++ b/stable-diffusion.cpp/ggml/build.zig @@ -0,0 +1,158 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +// Zig Version: 0.11.0 +// Zig Build Command: zig build +// Zig Run Command: zig build -h +// zig build run_dolly-v2 +// zig build run_gpt-2 +// zig build run_gpt-j +// zig build run_gpt-neox +// zig build run_mnist +// zig build run_mpt +// zig build run_replit +// zig build run_starcoder +// zig build run_test-grad0 +// zig build run_test-mul-mat0 +// zig build run_test-mul-mat2 +// zig build run_test-opt +// zig build run_test-vec1 +// zig build run_test0 +// zig build run_test1 +// zig build run_test2 +// zig build run_test3 +// zig build run_zig_test0 +// zig build run_zig_test1 +// zig build run_zig_test2 +// zig build run_zig_test3 +pub fn build(b: *std.build.Builder) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + const lib = b.addStaticLibrary(.{ + .name = "ggml", + .target = target, + .optimize = optimize, + }); + lib.addIncludePath(.{ .path = "./include" }); + lib.addIncludePath(.{ .path = "./include/ggml" }); + lib.addCSourceFiles(&.{ + "src/ggml.c", + }, &.{"-std=c11"}); + lib.linkLibC(); + lib.linkLibCpp(); + b.installArtifact(lib); + + // examples + const examples = .{ + "dolly-v2", + "gpt-2", + "gpt-j", + "gpt-neox", + "mnist", + "mpt", + "replit", + "starcoder", + // "whisper", + }; + inline for (examples) |name| { + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath(.{ .path = "./include" }); + exe.addIncludePath(.{ .path = "./include/ggml" }); + exe.addIncludePath(.{ .path = "./examples" }); + // exe.addIncludePath("./examples/whisper"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}), + "examples/common.cpp", + "examples/common-ggml.cpp", + // "examples/whisper/whisper.cpp", + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_" ++ name, "Run examples"); + run_step.dependOn(&run_cmd.step); + } + + // tests + const tests = if (builtin.target.cpu.arch == .x86_64) .{ + // "test-blas0", + // "test-grad0", + "test-mul-mat0", + // "test-mul-mat1", + "test-mul-mat2", + // "test-opt", + // "test-svd0", + // "test-vec0", + "test-vec1", + // "test-vec2", + "test0", + "test1", + "test2", + "test3", + } else .{ + // "test-blas0", + // "test-grad0", + "test-mul-mat0", + // "test-mul-mat1", + "test-mul-mat2", + // "test-opt", + // "test-svd0", + // "test-vec0", + // "test-vec1", + // "test-vec2", + "test0", + "test1", + "test2", + "test3", + }; + inline for (tests) |name| { + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath(.{ .path = "./include" }); + exe.addIncludePath(.{ .path = "./include/ggml" }); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("tests/{s}.c", .{name}), + }, 
&.{"-std=c11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+        const run_step = b.step("run_" ++ name, "Run tests");
+        run_step.dependOn(&run_cmd.step);
+    }
+
+    // zig_tests
+    const zig_tests = .{
+        "test0",
+        "test1",
+        "test2",
+        "test3",
+    };
+    inline for (zig_tests) |name| {
+        const exe = b.addExecutable(.{
+            .name = name,
+            .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(.{ .path = "./include" });
+        exe.addIncludePath(.{ .path = "./include/ggml" });
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+        const run_step = b.step("run_zig_" ++ name, "Run zig_tests");
+        run_step.dependOn(&run_cmd.step);
+    }
+}
diff --git a/stable-diffusion.cpp/ggml/ci/run.sh b/stable-diffusion.cpp/ggml/ci/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..15fc39e58ce0507e372dd64bca2ea6a74882e8e3
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/ci/run.sh
@@ -0,0 +1,334 @@
+#!/bin/bash
+#
+# sample usage:
+#
+# mkdir tmp
+#
+# # CPU-only build
+# bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
+# # with CUDA support
+# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
+
+if [ -z "$2" ]; then
+    echo "usage: $0 <output-dir> <mnt-dir>"
+    exit 1
+fi
+
+mkdir -p "$1"
+mkdir -p "$2"
+
+OUT=$(realpath "$1")
+MNT=$(realpath "$2")
+
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md
+
+sd=`dirname $0`
+cd $sd/../
+SRC=`pwd`
+
+## helpers
+
+# download a file if it does not exist or if it is outdated
+function gg_wget {
+    local out=$1
+    local url=$2
+
+    local cwd=`pwd`
+
+    mkdir -p $out
+    cd $out
+
+    # should not re-download if file is the same
+    wget -nv -N $url
+
+    cd $cwd
+}
+
+function gg_printf {
+    printf -- "$@" >> $OUT/README.md
+}
+
+function gg_run {
+    ci=$1
+
+    set -o pipefail
+    set -x
+
+    gg_run_$ci | tee $OUT/$ci.log
+    cur=$?
+    echo "$cur" > $OUT/$ci.exit
+
+    set +x
+    set +o pipefail
+
+    gg_sum_$ci
+
+    ret=$((ret | cur))
+}
+
+## ci
+
+# ctest_debug
+
+function gg_run_ctest_debug {
+    cd ${SRC}
+
+    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+    set +e
+}
+
+function gg_sum_ctest_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# ctest_release
+
+function gg_run_ctest_release {
+    cd ${SRC}
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ..
) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + if [ -z $GG_BUILD_LOW_PERF ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + else + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + fi + + set +e +} + +function gg_sum_ctest_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +# gpt_2 + +function gg_run_gpt_2 { + cd ${SRC} + + gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin + + cd build-ci-release + + set -e + + model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin" + prompts="../examples/prompts/gpt-2.txt" + + (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + + (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + + set +e +} + +function gg_sum_gpt_2 { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs short GPT-2 text generation\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)" + gg_printf '```\n' +} + +# mnist + +function gg_run_mnist { + cd ${SRC} + + cd build-ci-release + + set -e + + mkdir -p models/mnist + python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict + + model_f32="./models/mnist/ggml-model-f32.bin" + samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte" + + # first command runs and exports "mnist.ggml", the second command runs the exported model + + (time ./bin/mnist ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log + (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log + + set +e +} + +function gg_sum_mnist { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'MNIST\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)" + gg_printf '```\n' +} + +# whisper + +function gg_run_whisper { + cd ${SRC} + + gg_wget models-mnt/whisper/ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin + gg_wget models-mnt/whisper/ https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav + + cd build-ci-release + + set -e + + path_models="../models-mnt/whisper/" + model_f16="${path_models}/ggml-base.en.bin" + audio_0="${path_models}/jfk.wav" + + (time ./bin/whisper -m ${model_f16} -f ${audio_0} ) 2>&1 | tee -a $OUT/${ci}-main.log + + grep -q "And so my fellow Americans" $OUT/${ci}-main.log + + set +e +} + +function gg_sum_whisper { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs short Whisper transcription\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)" + gg_printf '```\n' +} + +# sam + +function gg_run_sam { + cd ${SRC} + + gg_wget models-mnt/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth + gg_wget models-mnt/sam/ https://raw.githubusercontent.com/YavorGIvanov/sam.cpp/ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b/img.jpg + + cd build-ci-release + + set -e + + path_models="../models-mnt/sam/" + 
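# (editor's note) convert-pth-to-ggml.py below turns the downloaded
# sam_vit_b_01ec64.pth checkpoint into the f16 ggml model that ./bin/sam
# consumes; the grep afterwards asserts the expected bounding box appears
# in the log.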
model_f16="${path_models}/ggml-model-f16.bin" + img_0="${path_models}/img.jpg" + + python3 ../examples/sam/convert-pth-to-ggml.py ${path_models}/sam_vit_b_01ec64.pth ${path_models}/ 1 + + (time ./bin/sam -m ${model_f16} -i ${img_0} ) 2>&1 | tee -a $OUT/${ci}-main.log + + grep -q "bbox (371, 436), (144, 168)" $OUT/${ci}-main.log + + set +e +} + +function gg_sum_sam { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Run SAM\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)" + gg_printf '```\n' +} + +# mpt + +function gg_run_mpt { + cd ${SRC} + + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin + + cd build-ci-release + + set -e + + path_models="../models-mnt/mpt/7B" + model_f16="${path_models}/ggml-model-f16.bin" + model_q4_0="${path_models}/ggml-model-q4_0.bin" + + python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1 + ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0 + + (time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + + set +e +} + +function gg_sum_mpt { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs short MPT text generation\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)" + gg_printf '```\n' +} + +## main + +if [ -z $GG_BUILD_LOW_PERF ]; then + rm -rf ${SRC}/models-mnt + + mnt_models=${MNT}/models + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt +fi + +python3 -m pip install -r ${SRC}/requirements.txt + +ret=0 + +test $ret -eq 0 && gg_run ctest_debug +test $ret -eq 0 && gg_run ctest_release +test $ret -eq 0 && gg_run gpt_2 +test $ret -eq 0 && gg_run mnist +test $ret -eq 0 && gg_run whisper +test $ret -eq 0 && gg_run sam + +if [ -z $GG_BUILD_LOW_PERF ]; then + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 16 ]; then + test $ret -eq 0 && gg_run mpt + fi +fi + +exit $ret diff --git a/stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake b/stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake new file mode 100644 index 0000000000000000000000000000000000000000..a9c7b6c91ec681d6f74db0619fa93798628d2dd7 --- /dev/null +++ b/stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake @@ -0,0 +1,54 @@ +# Add new build types + +# ReleaseGG - Release with enabled asserts + +SET(CMAKE_CXX_FLAGS_RELEASEGG + "-O3" + CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts." + FORCE ) +SET(CMAKE_C_FLAGS_RELEASEGG + "-O3" + CACHE STRING "Flags used by the compiler during release builds with enabled asserts." + FORCE ) +SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG + "" + CACHE STRING "Flags used for linking binaries during release builds with enabled asserts." 
+ FORCE ) +SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG + "" + CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts." + FORCE ) +MARK_AS_ADVANCED( + CMAKE_CXX_FLAGS_RELEASEGG + CMAKE_C_FLAGS_RELEASEGG + CMAKE_EXE_LINKER_FLAGS_RELEASEGG + CMAKE_SHARED_LINKER_FLAGS_RELEASEGG ) + +# RelWithDebInfoGG - RelWithDebInfo with enabled asserts + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG + "-O2 -g" + CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG + "-O2 -g" + CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG + "" + CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG + "" + CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts." + FORCE ) +MARK_AS_ADVANCED( + CMAKE_CXX_FLAGS_RELWITHDEBINFOGG + CMAKE_C_FLAGS_RELWITHDEBINFOGG + CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG + CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG ) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG") +endif() diff --git a/stable-diffusion.cpp/ggml/cmake/GitVars.cmake b/stable-diffusion.cpp/ggml/cmake/GitVars.cmake new file mode 100644 index 0000000000000000000000000000000000000000..1a4c24ebf6adeb1126e626f56de601621179353d --- /dev/null +++ b/stable-diffusion.cpp/ggml/cmake/GitVars.cmake @@ -0,0 +1,22 @@ +find_package(Git) + +# the commit's SHA1 +execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_SHA1 + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the date of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the subject of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/stable-diffusion.cpp/ggml/examples/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3404fb8be7fa114bc69558ebd26ae5e0f87f20c --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/CMakeLists.txt @@ -0,0 +1,30 @@ +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + set(cxx_flags + # TODO(marella): Add other warnings. 
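# (editor's note) the warning list being assembled here is applied to C++
# sources only, via the $<COMPILE_LANGUAGE:CXX> generator expression in the
# add_compile_options() call right below.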
+        -Wpedantic
+        -Wunused-variable
+        -Wno-unused-function
+        -Wno-multichar
+    )
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>")
+    endif()
+endif()
+
+add_library(common STATIC common.cpp)
+target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+add_library(common-ggml STATIC common-ggml.cpp)
+target_link_libraries(common-ggml PRIVATE ggml)
+target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+add_subdirectory(gpt-2)
+add_subdirectory(gpt-j)
+add_subdirectory(whisper)
+add_subdirectory(mnist)
+add_subdirectory(gpt-neox)
+add_subdirectory(dolly-v2)
+add_subdirectory(replit)
+add_subdirectory(mpt)
+add_subdirectory(starcoder)
+add_subdirectory(sam)
diff --git a/stable-diffusion.cpp/ggml/examples/common-ggml.cpp b/stable-diffusion.cpp/ggml/examples/common-ggml.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33ae03ae10ff419dd18b336e5e7d9406afc94d36
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/common-ggml.cpp
@@ -0,0 +1,246 @@
+#include "common-ggml.h"
+
+#include <regex>
+#include <map>
+
+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+void ggml_print_ftypes(FILE * fp) {
+    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+    }
+}
+
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+    enum ggml_ftype ftype;
+    if (str[0] == 'q') {
+        const auto it = GGML_FTYPE_MAP.find(str);
+        if (it == GGML_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+            return GGML_FTYPE_UNKNOWN;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum ggml_ftype) atoi(str);
+    }
+
+    return ftype;
+}
+
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip) {
+
+    ggml_type qtype = GGML_TYPE_F32;
+
+    switch (ftype) {
+        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN:
+        case GGML_FTYPE_ALL_F32:
+        case GGML_FTYPE_MOSTLY_F16:
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+        case GGML_FTYPE_MOSTLY_Q2_K:
+        case GGML_FTYPE_MOSTLY_Q3_K:
+        case GGML_FTYPE_MOSTLY_Q4_K:
+        case GGML_FTYPE_MOSTLY_Q5_K:
+        case GGML_FTYPE_MOSTLY_Q6_K:
+            {
+                fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+                return false;
+            }
+    };
+
+    if (!ggml_is_quantized(qtype)) {
+        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+        return false;
+    }
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+
+    std::vector<float> work;
+
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
+    std::vector<int64_t> hist_all(1 << 4, 0);
+
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ttype;
+
+        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+        if (finp.eof()) {
+            break;
+        }
+
+        int32_t nelements = 1;
+        int32_t ne[4] = { 1, 1, 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            nelements *= ne[i];
+        }
+
+        std::string name(length, 0);
+        finp.read (&name[0], length);
+
+        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
+
+        bool quantize = false;
+
+        // check if we should quantize this tensor
+        for (const auto & s : to_quant) {
+            if (std::regex_match(name, std::regex(s))) {
+                quantize = true;
+                break;
+            }
+        }
+
+        // check if we should skip this tensor
+        for (const auto & s : to_skip) {
+            if (std::regex_match(name, std::regex(s))) {
+                quantize = false;
+                break;
+            }
+        }
+
+        // quantize only 2D tensors
+        quantize &= (n_dims == 2);
+
+        if (quantize) {
+            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                return false;
+            }
+
+            if (ttype == GGML_TYPE_F16) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
+            ttype = qtype;
+        } else {
+            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+        }
+
+        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+        for (int i = 0; i < n_dims; ++i) {
+            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+        fout.write(&name[0], length);
+
+        if (quantize) {
+            work.resize(nelements); // for quantization
+
+            size_t cur_size = 0;
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
+            switch ((ggml_type) ttype) {
+                case GGML_TYPE_Q4_0:
+                    {
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_1:
+                    {
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_0:
+                    {
+                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_1:
+                    {
+                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q8_0:
+                    {
+                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_I8:
+                case GGML_TYPE_I16:
+                case GGML_TYPE_I32:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
+                case GGML_TYPE_Q8_K:
+                case GGML_TYPE_COUNT:
+                    {
+                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                        return false;
+                    }
+            }
+
+            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+            total_size_new += cur_size;
+
+            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                hist_all[i] += hist_cur[i];
+            }
+
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                printf("%5.3f ", hist_cur[i] / (float)nelements);
+            }
+            printf("\n");
+        } else {
+            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+            total_size_new += data_u8.size();
+        }
+
+        total_size_org += nelements * sizeof(float);
+    }
+
+    printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+    printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+
+    {
+        int64_t sum_all = 0;
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            sum_all += hist_all[i];
+        }
+
+        printf("%s: hist: ", __func__);
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        }
+        printf("\n");
+    }
+
+    return true;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/common-ggml.h b/stable-diffusion.cpp/ggml/examples/common-ggml.h
new file mode 100644
index 0000000000000000000000000000000000000000..477de341a1faa99344d05efd968dc58d434b6e24
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/common-ggml.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <fstream>
+#include <vector>
+#include <string>
+
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+void ggml_print_ftypes(FILE * fp = stderr);
+
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip);
diff --git a/stable-diffusion.cpp/ggml/examples/common.cpp b/stable-diffusion.cpp/ggml/examples/common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..603c655a184c745c0332351d729c2530afbe5772
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/common.cpp
@@ -0,0 +1,817 @@
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "common.h"
+
+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <locale>
+#include <codecvt>
+#include <sstream>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// Function to check if the next argument exists
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-np" || arg == "--n_parallel") {
+            params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--temp") {
+            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-last-n") {
+            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-penalty") {
+            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
+ } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-c" || arg == "--context") { + params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--ignore-eos") { + params.ignore_eos = true; + } else if (arg == "-m" || arg == "--model") { + params.model = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "-ip" || arg == "--interactive-port") { + params.interactive = true; + params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-f" || arg == "--file") { + get_next_arg(i, argc, argv, arg, params); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = get_next_arg(i, argc, argv, arg, params); + } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: random)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx); + fprintf(stderr, " --ignore-eos ignore EOS token during generation\n"); + fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + 
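// (editor's usage sketch; a hypothetical driver, not part of the diff — this
// is roughly how the ggml example programs wire the helpers above together)
//
//     int main(int argc, char ** argv) {
//         gpt_params params;
//         params.model = "models/ggml-model.bin";  // assumed default path
//         if (!gpt_params_parse(argc, argv, params)) {
//             return 1;
//         }
//         std::mt19937 rng(params.seed < 0 ? time(NULL) : params.seed);
//         if (params.prompt.empty()) {
//             params.prompt = gpt_random_prompt(rng);
//         }
//         // ... load the model, tokenize params.prompt, sample tokens ...
//     }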
+std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +std::string replace(const std::string & s, const std::string & from, const std::string & to) { + std::string result = s; + size_t pos = 0; + while ((pos = result.find(from, pos)) != std::string::npos) { + result.replace(pos, from.length(), to); + pos += to.length(); + } + return result; +} + +void gpt_vocab::add_special_token(const std::string & token) { + special_tokens.push_back(token); +} + +std::map json_parse(const std::string & fname) { + std::map result; + + // read file into string + std::string json; + { + std::ifstream ifs(fname); + if (!ifs) { + fprintf(stderr, "Failed to open %s\n", fname.c_str()); + exit(1); + } + + json = std::string((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); + } + + if (json[0] != '{') { + return result; + } + + // parse json + { + bool has_key = false; + bool in_token = false; + + std::string str_key = ""; + std::string str_val = ""; + + int n = json.size(); + for (int i = 1; i < n; ++i) { + if (!in_token) { + if (json[i] == ' ') continue; + if (json[i] == '"') { + in_token = true; + continue; + } + } else { + if (json[i] == '\\' && i+1 < n) { + if (has_key == false) { + str_key += json[i]; + } else { + str_val += json[i]; + } + ++i; + } else if (json[i] == '"') { + if (has_key == false) { + has_key = true; + ++i; + while (json[i] == ' ') ++i; + ++i; // : + while (json[i] == ' ') ++i; + if (json[i] != '\"') { + while (json[i] != ',' && json[i] != '}') { + str_val += json[i++]; + } + has_key = false; + } else { + in_token = true; + continue; + } + } else { + has_key = false; + } + + str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space + str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line + str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> " + + try { + result[str_key] = std::stoi(str_val); + } catch (...) 
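// (editor's note) std::stoi throws on non-numeric values, so entries whose
// value is not an integer are simply skipped by the empty handler below
// rather than aborting the vocab load.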
{ + //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); + + } + str_key = ""; + str_val = ""; + in_token = false; + continue; + } + if (has_key == false) { + str_key += json[i]; + } else { + str_val += json[i]; + } + } + } + } + + return result; +} + +std::string convert_to_utf8(const std::wstring & input) { + std::wstring_convert> converter; + return converter.to_bytes(input); +} + + +std::wstring convert_to_wstring(const std::string & input) { + std::wstring_convert> converter; + return converter.from_bytes(input); +} + +void gpt_split_words(std::string str, std::vector& words) { + const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; + const std::regex re(pattern); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } +} + +std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { + std::vector words; + + // first split the text into words + { + std::string str = text; + + // Generate the subpattern from the special_tokens vector if it's not empty + if (!vocab.special_tokens.empty()) { + const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])"); + std::string special_tokens_subpattern; + for (const auto & token : vocab.special_tokens) { + if (!special_tokens_subpattern.empty()) { + special_tokens_subpattern += "|"; + } + special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)"); + } + + std::regex re(special_tokens_subpattern); + std::smatch m; + // Split the text by special tokens. + while (std::regex_search(str, m, re)) { + // Split the substrings in-between special tokens into words. + gpt_split_words(m.prefix(), words); + // Add matched special tokens as words. + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } + // Remaining text without special tokens will be handled below. 
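// (editor's example) with special_tokens = {"<|endoftext|>"} the escaped
// subpattern becomes "<\|endoftext\|>", so "Hello<|endoftext|>World" is split
// into gpt_split_words("Hello"), the literal "<|endoftext|>" token, and then
// gpt_split_words("World").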
+
+std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
+    std::vector<gpt_vocab::id> output;
+    std::stringstream ss(input);
+    std::string token;
+
+    while (std::getline(ss, token, delimiter)) {
+        output.push_back(std::stoi(token));
+    }
+
+    return output;
+}
+
+std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test) {
+    if (fpath_test.empty()) {
+        fprintf(stderr, "%s : No test file found.\n", __func__);
+        return std::map<std::string, std::vector<gpt_vocab::id>>();
+    }
+
+    std::map<std::string, std::vector<gpt_vocab::id>> tests;
+
+    auto fin = std::ifstream(fpath_test, std::ios_base::in);
+    const char * delimiter = " => ";
+    const char del_tok = ',';
+    std::string line;
+    while (std::getline(fin, line)) {
+        size_t delimiterPos = line.find(delimiter);
+        if (delimiterPos != std::string::npos) {
+            std::string text = line.substr(0, delimiterPos);
+            std::string s_tokens = line.substr(delimiterPos + std::strlen(delimiter));
+            tests[text] = parse_tokens_from_string(s_tokens, del_tok);
+        }
+    }
+    return tests;
+}
+
+void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test) {
+    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
+
+    size_t n_fails = 0;
+
+    for (const auto & test : tests) {
+        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
+
+        if (tokens != test.second) {
+            n_fails++;
+
+            // print out failure cases
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
+            fprintf(stderr, "%s : tokens in hf:   ", __func__);
+            for (const auto & t : test.second) {
+                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : tokens in ggml: ", __func__);
+            for (const auto & t : tokens) {
+                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+
+    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
+}
+
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    for (const auto & kv : vocab.token_to_id) {
+        vocab.id_to_token[kv.second] = kv.first;
+    }
+
+    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
+
+    // print the vocabulary
+    //for (auto kv : vocab.token_to_id) {
+    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
+    //}
+
+    return true;
+}
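The test files consumed by extract_tests_from_file are plain text, one case per line: the raw input, the " => " delimiter, then the expected token ids separated by commas. Illustrative content (the ids here are made up):

```
Hello world => 15496,995
I am a token => 40,716,257,11241
```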
+
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng) {
+    int n_logits = vocab.id_to_token.size();
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const double scale = 1.0/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            logits_id.push_back(std::make_pair(logits[i]*scale, i));
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
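A minimal sketch of calling gpt_sample_top_k_top_p directly, with toy logits standing in for a real model's output:

```cpp
#include "common.h"
#include <random>

int main() {
    gpt_vocab vocab;
    vocab.id_to_token = {{0, "a"}, {1, "b"}, {2, "c"}, {3, "d"}}; // toy "vocabulary"

    // unnormalized scores for the 4 tokens
    const float logits[4] = {2.0f, 1.0f, 0.5f, -1.0f};

    std::mt19937 rng(42);
    // keep the 2 best tokens, then cut at 90% cumulative probability
    gpt_vocab::id id = gpt_sample_top_k_top_p(vocab, logits,
                                              /*top_k=*/2, /*top_p=*/0.9, /*temp=*/1.0, rng);
    // only ids 0 and 1 survive; with these logits, 0 ("a") is drawn ~73% of the time
    return (int) id;
}
```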
+
+gpt_vocab::id gpt_sample_top_k_top_p_repeat(
+        const gpt_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng) {
+
+    int n_logits = vocab.id_to_token.size();
+
+    const auto * plogits = logits;
+
+    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
+
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        gpt_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const float scale = 1.0f/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
+            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
+            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
+                // if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
+                if (plogits[i] < 0.0f) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
+                }
+            } else {
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
+            }
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
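In symbols, writing $z_i$ for the raw logit, $T$ for the temperature and $p$ for repeat_penalty, the score fed into the top-k/top-p machinery above is

$$
\tilde z_i =
\begin{cases}
(z_i/T)\cdot p & i \text{ in the last } \textit{repeat\_last\_n} \text{ tokens},\ z_i < 0 \\
(z_i/T)/p & i \text{ in the last } \textit{repeat\_last\_n} \text{ tokens},\ z_i \ge 0 \\
z_i/T & \text{otherwise}
\end{cases}
$$

so with $z_i = 2.6$, $T = 1$ and a typical $p = 1.3$, a recently generated token's score drops to $2.0$, lowering its softmax probability.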
+
+bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
+    drwav wav;
+    std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+    if (fname == "-") {
+        {
+            uint8_t buf[1024];
+            while (true)
+            {
+                const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                if (n == 0) {
+                    break;
+                }
+                wav_data.insert(wav_data.end(), buf, buf + n);
+            }
+        }
+
+        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+            fprintf(stderr, "error: failed to open WAV file from stdin\n");
+            return false;
+        }
+
+        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+    }
+    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
+        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
+        return false;
+    }
+
+    if (wav.channels != 1 && wav.channels != 2) {
+        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
+        return false;
+    }
+
+    if (stereo && wav.channels != 2) {
+        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
+        return false;
+    }
+
+    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
+        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
+        return false;
+    }
+
+    if (wav.bitsPerSample != 16) {
+        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
+        return false;
+    }
+
+    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+    std::vector<int16_t> pcm16;
+    pcm16.resize(n*wav.channels);
+    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+    drwav_uninit(&wav);
+
+    // convert to mono, float
+    pcmf32.resize(n);
+    if (wav.channels == 1) {
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32[i] = float(pcm16[i])/32768.0f;
+        }
+    } else {
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+        }
+    }
+
+    if (stereo) {
+        // convert to stereo, float
+        pcmf32s.resize(2);
+
+        pcmf32s[0].resize(n);
+        pcmf32s[1].resize(n);
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+        }
+    }
+
+    return true;
+}
+
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_samples      = pcmf32.size();
+    const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+    if (n_samples_last >= n_samples) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all  = 0.0f;
+    float energy_last = 0.0f;
+
+    for (int i = 0; i < n_samples; i++) {
+        energy_all += fabsf(pcmf32[i]);
+
+        if (i >= n_samples - n_samples_last) {
+            energy_last += fabsf(pcmf32[i]);
+        }
+    }
+
+    energy_all  /= n_samples;
+    energy_last /= n_samples_last;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    if (energy_last > vad_thold*energy_all) {
+        return false;
+    }
+
+    return true;
+}
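vad_simple returns true when the trailing last_ms window is quiet relative to the average energy of the whole buffer, i.e. speech appears to have ended. A toy check of that decision rule (all numbers invented; freq_thold = 0 skips the high-pass step):

```cpp
#include "common.h"
#include <vector>

int main() {
    const int sr = 16000;                                   // COMMON_SAMPLE_RATE
    std::vector<float> pcm(sr, 0.5f);                       // 1 s of loud "speech"
    for (int i = sr - sr/4; i < sr; i++) pcm[i] = 0.01f;    // quiet last 250 ms

    // energy_last (0.01) <= 0.6 * energy_all (~0.38), so this returns true
    const bool finished = vad_simple(pcm, sr, /*last_ms=*/250,
                                     /*vad_thold=*/0.6f, /*freq_thold=*/0.0f, true);
    return finished ? 0 : 1;
}
```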
+
+float similarity(const std::string & s0, const std::string & s1) {
+    const size_t len0 = s0.size() + 1;
+    const size_t len1 = s1.size() + 1;
+
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    for (size_t i = 0; i < len1; i++) {
+        prevCol[i] = i;
+    }
+
+    for (size_t i = 0; i < len0; i++) {
+        col[0] = i;
+        for (size_t j = 1; j < len1; j++) {
+            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
+        }
+        col.swap(prevCol);
+    }
+
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / std::max(s0.size(), s1.size()));
+}
+
+bool sam_params_parse(int argc, char ** argv, sam_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(argv[++i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-i" || arg == "--inp") {
+            params.fname_inp = argv[++i];
+        } else if (arg == "-o" || arg == "--out") {
+            params.fname_out = argv[++i];
+        } else if (arg == "-h" || arg == "--help") {
+            sam_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            sam_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
+    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
+    fprintf(stderr, "  -o FNAME, --out FNAME\n");
+    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
+    fprintf(stderr, "\n");
+}
diff --git a/stable-diffusion.cpp/ggml/examples/common.h b/stable-diffusion.cpp/ggml/examples/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d4e9c37c9f50a486497bfb1ad24557b138ffc35
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/common.h
@@ -0,0 +1,179 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+
+#define COMMON_SAMPLE_RATE 16000
+
+//
+// GPT CLI argument parsing
+//
+
+struct gpt_params {
+    int32_t seed         = -1;   // RNG seed
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict    = 200;  // new tokens to predict
+    int32_t n_parallel   = 1;    // number of parallel streams
+    int32_t n_batch      = 8;    // batch size for prompt processing
+    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
+    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU
+
+    bool ignore_eos = false; // ignore EOS token when generating text
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.9f;
+    float   temp  = 0.9f;
+    int32_t repeat_last_n  = 64;
+    float   repeat_penalty = 1.00f;
+
+    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt     = "";
+    std::string token_test = "";
+
+    bool    interactive      = false;
+    int32_t interactive_port = -1;
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
+
+std::string trim(const std::string & s);
+
+std::string replace(
+        const std::string & s,
+        const std::string & from,
+        const std::string & to);
+
+struct gpt_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    std::map<token, id> token_to_id;
+    std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+std::string convert_to_utf8(const std::wstring & input);
+
+std::wstring convert_to_wstring(const std::string & input);
+
+void gpt_split_words(std::string str, std::vector<std::string>& words);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// test outputs of gpt_tokenize
+//
+//   - compare with tokens generated by the huggingface tokenizer
+//   - test cases are chosen based on the model's main language (under 'prompt' directory)
+//   - if all sentences are tokenized identically, print 'All tests passed.'
+//   - otherwise, print sentence, huggingface tokens, ggml tokens
+//
+void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample next token given probabilities for each embedding
+//
+//   - consider only the top K tokens
+//   - from them, consider only the top tokens with cumulative probability > P
+//
+// TODO: not sure if this implementation is correct
+// TODO: temperature is not implemented
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+gpt_vocab::id gpt_sample_top_k_top_p_repeat(
+        const gpt_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng);
+
+//
+// Audio utils
+//
+
+// Read WAV audio file and store the PCM data into pcmf32
+// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
+// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
+bool read_wav(
+        const std::string & fname,
+        std::vector<float> & pcmf32,
+        std::vector<std::vector<float>> & pcmf32s,
+        bool stereo);
+
+// Apply a high-pass frequency filter to PCM audio
+// Suppresses frequencies below cutoff Hz
+void high_pass_filter(
+        std::vector<float> & data,
+        float cutoff,
+        float sample_rate);
+
+// Basic voice activity detection (VAD) using audio energy adaptive threshold
+bool vad_simple(
+        std::vector<float> & pcmf32,
+        int   sample_rate,
+        int   last_ms,
+        float vad_thold,
+        float freq_thold,
+        bool  verbose);
+
+// compute similarity between two strings using Levenshtein distance
+float similarity(const std::string & s0, const std::string & s1);
+
+//
+// SAM argument parsing
+//
+
+struct sam_params {
+    int32_t seed      = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+
+    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
+    std::string fname_inp = "img.jpg";
+    std::string fname_out = "img.out";
+};
+
+bool sam_params_parse(int argc, char ** argv, sam_params & params);
+
+void sam_print_usage(int argc, char ** argv, const sam_params & params);
diff --git 
a/stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2d55563771884e87e2ce8bea775eb700ef2d6c7 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# dollyv2 + +set(TEST_TARGET dollyv2) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# dollyv2-quantize + +set(TEST_TARGET dollyv2-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/stable-diffusion.cpp/ggml/examples/dolly-v2/README.md b/stable-diffusion.cpp/ggml/examples/dolly-v2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..add97385a31b5e857189f4e6d0bfde609d25717e --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/dolly-v2/README.md @@ -0,0 +1,187 @@ +# Dolly-V2 + +Transformer architecture: GPT-NeoX + +Modeled from examples/stablelm + +Ref: https://github.com/databrickslabs/dolly + +Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha + +## Usage + +```bash +# get the repo and build it +git clone https://github.com/ggerganov/ggml +cd ggml +mkdir build && cd build +cmake .. +make -j + +# get the Dolly-V2 3B model +git clone https://huggingface.co/databricks/dolly-v2-3b + +# install Python dependencies +python3 -m pip install -r ../requirements.txt + +# convert model to FP16 +python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1 + +# run inference using FP16 precision +./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64 + +main: seed = 1683218142 +dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ... +dollyv2_model_load: n_vocab = 50280 +dollyv2_model_load: n_ctx = 2048 +dollyv2_model_load: n_embd = 2560 +dollyv2_model_load: n_head = 32 +dollyv2_model_load: n_layer = 32 +dollyv2_model_load: n_rot = 20 +dollyv2_model_load: ftype = 1 +dollyv2_model_load: ggml ctx size = 7374.91 MB +dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536 +dollyv2_model_load: ................................................ done +dollyv2_model_load: model size = 5295.10 MB / num tensors = 388 +main: number of tokens in prompt = 32 +main: token[0] = 30003, Below +main: token[1] = 310, is +main: token[2] = 271, an +main: token[3] = 9775, instruction +main: token[4] = 326, that +main: token[5] = 8631, describes +main: token[6] = 247, a +main: token[7] = 4836, task +main: token[8] = 964, . +main: token[9] = 19566, Write +main: token[10] = 247, a +main: token[11] = 2380, response +main: token[12] = 326, that +main: token[13] = 20420, appropriately +main: token[14] = 29141, completes +main: token[15] = 253, the +main: token[16] = 2748, request +main: token[17] = 964, . +main: token[18] = 187, + +main: token[19] = 187, + +main: token[20] = 50278, ### Instruction: +main: token[21] = 187, + +main: token[22] = 5443, State +main: token[23] = 253, the +main: token[24] = 4495, meaning +main: token[25] = 273, of +main: token[26] = 1495, life +main: token[27] = 964, . +main: token[28] = 187, + +main: token[29] = 187, + +main: token[30] = 50279, ### Response: +main: token[31] = 187, + + +Below is an instruction that describes a task. Write a response that appropriately completes the request. + +### Instruction: +State the meaning of life. + +### Response: +The meaning of life is to love and be loved. 
+ +### End + +main: mem per token = 16136720 bytes +main: load time = 2202.58 ms +main: sample time = 2.57 ms +main: predict time = 1497.14 ms / 33.27 ms per token +main: total time = 6187.27 ms +``` + +## 5-bit integer quantization mode + +```bash +# quantize the model to 5-bits using Q5_0 quantization +./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0 + +# run the quantized model +./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64 + +main: seed = 1683218518 +dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ... +dollyv2_model_load: n_vocab = 50280 +dollyv2_model_load: n_ctx = 2048 +dollyv2_model_load: n_embd = 2560 +dollyv2_model_load: n_head = 32 +dollyv2_model_load: n_layer = 32 +dollyv2_model_load: n_rot = 20 +dollyv2_model_load: ftype = 8 +dollyv2_model_load: ggml ctx size = 3902.68 MB +dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536 +dollyv2_model_load: ................................................ done +dollyv2_model_load: model size = 1822.87 MB / num tensors = 388 +main: number of tokens in prompt = 32 +main: token[0] = 30003, Below +main: token[1] = 310, is +main: token[2] = 271, an +main: token[3] = 9775, instruction +main: token[4] = 326, that +main: token[5] = 8631, describes +main: token[6] = 247, a +main: token[7] = 4836, task +main: token[8] = 964, . +main: token[9] = 19566, Write +main: token[10] = 247, a +main: token[11] = 2380, response +main: token[12] = 326, that +main: token[13] = 20420, appropriately +main: token[14] = 29141, completes +main: token[15] = 253, the +main: token[16] = 2748, request +main: token[17] = 964, . +main: token[18] = 187, + +main: token[19] = 187, + +main: token[20] = 50278, ### Instruction: +main: token[21] = 187, + +main: token[22] = 5443, State +main: token[23] = 253, the +main: token[24] = 4495, meaning +main: token[25] = 273, of +main: token[26] = 1495, life +main: token[27] = 964, . +main: token[28] = 187, + +main: token[29] = 187, + +main: token[30] = 50279, ### Response: +main: token[31] = 187, + + +Below is an instruction that describes a task. Write a response that appropriately completes the request. + +### Instruction: +State the meaning of life. + +### Response: +The meaning of life is the discovery of the true self. 
+
+### End
+
+main: mem per token = 16127760 bytes
+main:     load time =  1011.09 ms
+main:   sample time =     2.79 ms
+main:  predict time =  1271.62 ms / 27.64 ms per token
+main:    total time =  2802.51 ms
+```
+
+## Notes
+
+- No guarantees for correctness
+- The tokenizer is currently hacked - probably works only for English
+- Non-parallel residual is not supported
+- Contributions and improvements are welcome
diff --git a/stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py
new file mode 100644
index 0000000000000000000000000000000000000000..0019810e28e1ff2a7ca7ad2795e4fb1e2eb41a1a
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py
@@ -0,0 +1,116 @@
+import sys
+import struct
+import json
+import numpy as np
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+if len(sys.argv) < 2:
+    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+fname_out = sys.argv[1] + "/ggml-model.bin"
+
+with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+    encoder = json.load(f)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
+#print (model)
+
+#print(tokenizer.encode('I believe the meaning of life is'))
+
+list_vars = model.state_dict()
+for name in list_vars.keys():
+    print(name, list_vars[name].shape, list_vars[name].dtype)
+
+fout = open(fname_out, "wb")
+
+print(hparams)
+
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", hparams["vocab_size"]))
+fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+fout.write(struct.pack("i", hparams["hidden_size"]))
+fout.write(struct.pack("i", hparams["num_attention_heads"]))
+fout.write(struct.pack("i", hparams["num_hidden_layers"]))
+fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
+fout.write(struct.pack("i", hparams["use_parallel_residual"]))
+fout.write(struct.pack("i", ftype))
+
+# TODO: temporary hack to not deal with implementing the tokenizer
+dot_token = tokenizer.encode('.')[0]
+for i in range(hparams["vocab_size"]):
+    text = tokenizer.decode([dot_token, i]).encode('utf-8')
+    # remove the first byte (it's always '.')
+    text = text[1:]
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+    print("Processing variable: " + name + " with shape: ", data.shape)
+
+    # we don't need these
+    if name.endswith(".attention.masked_bias") or \
+       name.endswith(".attention.bias") or \
+       name.endswith(".attention.rotary_emb.inv_freq"):
+        print("  Skipping variable: " + name)
+        continue
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if ftype != 0:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    # header
+    name_bytes = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+    fout.write(name_bytes)
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
diff --git a/stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp b/stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1a10880bb551336a49389714e47865212215deb5
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp
@@ -0,0 +1,969 @@
+#include "ggml/ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#if !defined(_WIN32)
+#define DOLLY_INTERACTIVE_PORT
+#endif
+
+#if defined(DOLLY_INTERACTIVE_PORT)
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// default hparams (Dolly-V2 3B)
+struct dollyv2_hparams {
+    int32_t n_vocab = 50254; // tokenizer.vocab_size
+    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
+    int32_t n_embd  = 2560;  // model.config.hidden_size
+    int32_t n_head  = 32;    // model.config.num_attention_heads
+    int32_t n_layer = 32;    // model.config.num_hidden_layers
+    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
+    int32_t par_res = 1;     // 1 = true, 0 = false
+    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
+    float   eps     = 1e-5f;
+};
+
+const std::string INSTRUCTION_KEY = "### Instruction:";
+const std::string RESPONSE_KEY    = "### Response:";
+const std::string END_KEY         = "### End";
+const std::string INTRO_BLURB     = "Below is an instruction that describes a task. Write a response that appropriately completes the request.";
+
+// dollyv2 prompt format
+std::string prompt_for_generation(const std::string& instruction) {
+    return INTRO_BLURB + "\n\n" + INSTRUCTION_KEY + "\n" + instruction + "\n\n" + RESPONSE_KEY + "\n";
+}
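Concretely, prompt_for_generation("State the meaning of life.") produces exactly the template visible in the README logs above:

```
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
State the meaning of life.

### Response:
```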
+
+struct dollyv2_layer {
+    // pre normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    // attention
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // post normalization
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // ff
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct dollyv2_model {
+    dollyv2_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte; // token embedding
+
+    struct ggml_tensor * lmh_g; // language model head
+    //struct ggml_tensor * lmh_b; // language model bias
+
+    std::vector<dollyv2_layer> layers;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct ggml_context * ctx;
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+// load the model's weights from a file
+bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) {
+    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+
+    auto fin = std::ifstream(fname, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
+        }
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        printf("%s: par_res = %d\n", __func__, hparams.par_res);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+    }
+
+    // load vocab
+    {
+        const int32_t n_vocab = model.hparams.n_vocab;
+
+        std::string word;
+        std::vector<char> buf(128);
+
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            fin.read((char *) &len, sizeof(len));
+
+            buf.resize(len);
+            fin.read((char *) buf.data(), len);
+            word.assign(buf.data(), len);
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+
+        vocab.add_special_token("### End");
+        vocab.add_special_token("### Instruction:");
+        vocab.add_special_token("### 
Response:"); + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 16*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["gpt_neox.embed_in.weight"] = model.wte; + + model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; + model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; + + model.tensors["embed_out.weight"] = model.lmh_g; + //model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + + // unmapped: attention.rotary_emb, mlp.act + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w; + model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
+        }
+    }
+
+    // key + value memory
+    {
+        const auto & hparams = model.hparams;
+
+        const int n_embd  = hparams.n_embd;
+        const int n_layer = hparams.n_layer;
+        const int n_ctx   = hparams.n_ctx;
+
+        const int64_t n_mem      = n_layer*n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+
+        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+
+        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
+    }
+
+    // load weights
+    {
+        int n_tensors = 0;
+        size_t total_size = 0;
+
+        printf("%s: ", __func__);
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ttype;
+
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+            if (fin.eof()) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
+
+            std::string name(length, 0);
+            fin.read(&name[0], length);
+
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
+                return false;
+            }
+
+            auto tensor = model.tensors[name];
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
+                return false;
+            }
+
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
+                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }
+
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
+                return false;
+            }
+
+            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+
+            total_size += ggml_nbytes(tensor);
+            if (++n_tensors % 8 == 0) {
+                printf(".");
+                fflush(stdout);
+            }
+        }
+
+        printf(" done\n");
+
+        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+    }
+
+    fin.close();
+
+    return true;
+}
+
+// feed-forward network
+ggml_tensor * gpt_neox_ff(
+        const dollyv2_layer & layer,
+        ggml_context        * ctx0,
+        ggml_tensor         * inp,
+        float                 eps) {
+    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
+
+    cur = ggml_add(ctx0,
+        ggml_mul(ctx0,
+            ggml_repeat(ctx0, layer.ln_2_g, cur),
+            cur),
+        ggml_repeat(ctx0, layer.ln_2_b, cur));
+
+    cur = ggml_mul_mat(ctx0,
+            layer.c_mlp_fc_w,
+            cur);
+
+    cur = ggml_add(ctx0,
+            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
+            cur);
+
+    // GELU activation
+    cur = ggml_gelu(ctx0, cur);
+
+    // projection
+    // cur = proj_w*cur + proj_b
+    cur = ggml_mul_mat(ctx0,
+            layer.c_mlp_proj_w,
+            cur);
+
+    cur = ggml_add(ctx0,
+            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
+            cur);
+    return cur;
+}
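gpt_neox_ff is the standard GPT-NeoX MLP block. In symbols, with $\gamma,\beta$ the ln_2_g/ln_2_b layer-norm parameters, $W_{fc}$ expanding n_embd to 4*n_embd and $W_{proj}$ projecting back:

$$\mathrm{FF}(x) = W_{proj}\,\operatorname{GELU}\!\big(W_{fc}\,\mathrm{LN}_{\gamma,\beta}(x) + b_{fc}\big) + b_{proj}$$

Whether it receives the attention output (sequential, par_res == 0) or the layer input (parallel residual) is decided by the caller below.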
+
+// evaluate the transformer
+//
+//   - model:     the model
+//   - n_threads: number of threads to use
+//   - n_past:    the context size so far
+//   - embd_inp:  the embeddings of the tokens in the context
+//   - embd_w:    the predicted logits for the next token
+//
+bool dollyv2_eval(
+        const dollyv2_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt_vocab::id> & embd_inp,
+              std::vector<float>         & embd_w,
+              size_t                     & mem_per_token) {
+    const int N = embd_inp.size();
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot   = hparams.n_rot;
+
+    static size_t buf_size = 256u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = { };
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    int * data = (int *) KQ_pos->data;
+    for (int i = 0; i < N; ++i) {
+        data[i] = n_past + i;
+    }
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * cur;
+
+        // self-attention
+        {
+            {
+                cur = ggml_norm(ctx0, inpL, hparams.eps);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0,
+                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+                            cur),
+                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+            }
+
+            // compute QKV
+            {
+                cur = ggml_mul_mat(ctx0,
+                        model.layers[il].c_attn_attn_w,
+                        cur);
+
+                cur = ggml_add(ctx0,
+                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
+                        cur);
+            }
+
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
+
+            // using mode = 2 for GPT-NeoX mode
+            Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0);
+            Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0);
+
+            // store key and value to memory
+            {
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(model.memory_v),
+                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
+
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); + } + } + + if (hparams.par_res == 0) { + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + + cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } else { + struct ggml_tensor * inpFF = cur; + + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + // note here we pass inpL instead of cur + cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps); + + // layer input + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + //inpL = ggml_add(ctx0, + // ggml_repeat(ctx0, model.lmh_b, inpL), + // inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = 
ggml_used_mem(ctx0)/N;
+    }
+    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+
+    ggml_free(ctx0);
+
+    return true;
+}
+
+std::string execute_prompt(
+        const dollyv2_model & model,
+        gpt_vocab & vocab,
+        const std::string & prompt,
+        gpt_params & params,
+        std::mt19937 & rng,
+        int64_t t_load_us,
+        int64_t t_sample_us,
+        int64_t t_predict_us,
+        size_t mem_per_token,
+        int n_past,
+        bool stream_response_to_cout = false) {
+    std::string output = "";
+    std::vector<float> logits;
+
+    // tokenize the prompt
+    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, prompt);
+
+    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    for (size_t i = 0; i < embd_inp.size(); i++) {
+        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+    }
+    printf("\n");
+
+    std::vector<gpt_vocab::id> embd;
+
+    // warm-up run to measure the required inference memory per token
+    dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token);
+
+    const int32_t end_token = vocab.token_to_id["### End"];
+
+    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+        // predict
+        if (embd.size() > 0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+                printf("Failed to predict\n");
+                return output;
+            }
+
+            t_predict_us += ggml_time_us() - t_start_us;
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (int32_t(embd.size()) > params.n_batch) {
+                    break;
+                }
+            }
+            i += embd.size() - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            output += vocab.id_to_token[id];
+            if (stream_response_to_cout) {
+                printf("%s", vocab.id_to_token[id].c_str());
+            }
+        }
+        if (stream_response_to_cout) {
+            fflush(stdout);
+        }
+
+        // end of text token
+        if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) {
+            return output;
+        }
+    }
+    return output;
+}
+
+#if defined(DOLLY_INTERACTIVE_PORT)
+int setup_port(const int port) {
+    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0) {
+        fprintf(stderr, "%s: Failed to create new socket\n", __func__);
+        return -1;
+    }
+
+    sockaddr_in servaddr;
+    std::memset(&servaddr, 0, sizeof(servaddr));
+
+    servaddr.sin_family = AF_INET;
+    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
+    servaddr.sin_port = htons(port);
+
+    if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
+        fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port);
+        return -1;
+    }
+
+    if (listen(sockfd, 10) < 0) {
+        fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port);
+        return -1;
+    }
+    return sockfd;
+}
+
+std::string read_from_port(int sockfd, int clientfd) {
+    if (clientfd < 0) {
+        fprintf(stderr, "%s: Failed to accept new connection\n", __func__);
+        return "";
+    }
+
+    char buffer[4096];
+    std::memset(buffer, 0, sizeof(buffer));
+
+    if (read(clientfd, buffer, sizeof(buffer)) < 0) {
+        fprintf(stderr, "%s: Failed to read from client\n", __func__);
+    } else {
+        std::cout << "Received: " << buffer;
+        return std::string(buffer);
+    }
+    return std::string("");
+}
+#endif
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    gpt_params params;
+    params.model = "models/dolly-v2-3b/ggml-model-f16.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    printf("%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+
+    int64_t t_load_us = 0;
+    int64_t t_sample_us = 0;
+    int64_t t_predict_us = 0;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+
+    int n_past = 0;
+
+    gpt_vocab vocab;
+    dollyv2_model model;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!dollyv2_model_load(params.model, model, vocab)) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        t_load_us = ggml_time_us() - t_start_us;
+
+        test_gpt_tokenizer(vocab, params.token_test);
+    }
+
+#if defined(DOLLY_INTERACTIVE_PORT)
+    int sockfd = -1;
+    if (params.interactive_port != -1) {
+        sockfd = setup_port(params.interactive_port);
+        if (sockfd == -1) {
+            return 1;
+        }
+        fprintf(stdout, "Model is ready on port %i\n", params.interactive_port);
+        fflush(stdout);
+    }
+#endif
+
+    if (params.interactive || params.interactive_port != -1) {
+        while (true) {
+            std::string prompt_input;
+#if defined(DOLLY_INTERACTIVE_PORT)
+            int clientfd = -1;
+            if (params.interactive_port != -1) {
+                sockaddr_in clientaddr;
+                socklen_t clientaddrlen = sizeof(clientaddr);
+                clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen);
+                prompt_input = read_from_port(sockfd, clientfd);
+            } else
+#endif
+            {
+                printf("Please enter your question:\n>");
+                fflush(stdout);
+
+                std::getline(std::cin, prompt_input);
+            }
+
+            if (strcmp(prompt_input.c_str(), "exit") == 0) {
+                break;
+            }
+
+            const std::string prompt = prompt_for_generation(prompt_input);
+            // call the model
+            const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
+
+#if defined(DOLLY_INTERACTIVE_PORT)
+            if (params.interactive_port != -1) {
+                if (write(clientfd, response.c_str(), response.size()) < 0) {
+                    fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str());
+                }
+
+                if (close(clientfd) < 0) {
+                    fprintf(stderr, "%s: Failed to close client socket\n", __func__);
+                }
+            } else
+#endif
+            {
+                printf("%s\n\n", response.c_str());
+            }
+            fflush(stdout);
+        }
+    } else {
+        if (params.prompt.empty()) {
+            params.prompt = gpt_random_prompt(rng);
+        }
+
+        const std::string prompt = prompt_for_generation(params.prompt);
+        execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
+        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
+        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+#if defined(DOLLY_INTERACTIVE_PORT)
+    if (params.interactive_port != -1 && close(sockfd) < 0) {
+        fprintf(stderr, "%s: Failed to close server socket\n", __func__);
+    }
+#endif
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp b/stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c0d24ccf0f035a0853c3de5153f857ebeb8282d
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp
@@ -0,0 +1,178 @@
+#include "ggml/ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <regex>
+
+// default hparams (dollyv2 3B)
+struct dollyv2_hparams {
+    int32_t n_vocab = 50254; // tokenizer.vocab_size
+    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
+    int32_t n_embd  = 2560;  // model.config.hidden_size
+    int32_t n_head  = 32;    // model.config.num_attention_heads
+    int32_t n_layer = 32;    // model.config.num_hidden_layers
+    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
+    int32_t par_res = 1;     // 1 = true, 0 = false
+    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
+};
+
+// quantize a model
+bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+    gpt_vocab vocab;
+
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    auto finp = std::ifstream(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        finp.read((char *) &magic, sizeof(magic));
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &magic, sizeof(magic));
+    }
+
+    dollyv2_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
+
+        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
+        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
+        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
+    }
+
+    // load vocab
+    {
+        const int32_t n_vocab = hparams.n_vocab;
+
+        std::string word;
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            finp.read ((char *) &len, sizeof(len));
+            fout.write((char *) &len, sizeof(len));
+
+            word.resize(len);
+            finp.read ((char *) word.data(), len);
+            fout.write((char *) word.data(), len);
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+    }
+
+    // regexes of tensor names to be quantized
+    const std::vector<std::string> to_quant = {
+        ".*weight",
+    };
+
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    finp.close();
+    fout.close();
+
+    return true;
+}
+
+// usage:
+//  ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
+//
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+        ggml_print_ftypes(stderr);
+        return 1;
+    }
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/dr_wav.h b/stable-diffusion.cpp/ggml/examples/dr_wav.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd3e95b34a0fadcffcc94e01a353339975f18fb7
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/dr_wav.h
@@ -0,0 +1,6434 @@
+/*
+WAV audio loader and writer. Choice of public domain or MIT-0. See license statements at the end of this file.
+dr_wav - v0.12.16 - 2020-12-02
+
+David Reid - mackron@gmail.com
+
+GitHub: https://github.com/mackron/dr_libs
+*/
+
+/*
+RELEASE NOTES - VERSION 0.12
+============================
+Version 0.12 includes breaking changes to custom chunk handling.
+
+
+Changes to Chunk Callback
+-------------------------
+dr_wav supports the ability to fire a callback when a chunk is encountered (except for WAVE and FMT chunks). The callback has been updated to include both the
+container (RIFF or Wave64) and the FMT chunk which contains information about the format of the data in the wave file.
+
+Previously, there was no direct way to determine the container, and therefore no way to discriminate against the different IDs in the chunk header (RIFF and
+Wave64 containers encode chunk ID's differently). The `container` parameter can be used to know which ID to use.
+
+Sometimes it can be useful to know the data format at the time the chunk callback is fired. A pointer to a `drwav_fmt` object is now passed into the chunk
+callback which will give you information about the data format. To determine the sample format, use `drwav_fmt_get_format()`. This will return one of the
+`DR_WAVE_FORMAT_*` tokens.
+*/
+
+/*
+Introduction
+============
+This is a single file library. To use it, do something like the following in one .c file.
+
+    ```c
+    #define DR_WAV_IMPLEMENTATION
+    #include "dr_wav.h"
+    ```
+
+You can then #include this file in other parts of the program as you would with any other header file. Do something like the following to read audio data:
+
+    ```c
+    drwav wav;
+    if (!drwav_init_file(&wav, "my_song.wav", NULL)) {
+        // Error opening WAV file.
+    }
+
+    drwav_int32* pDecodedInterleavedPCMFrames = malloc(wav.totalPCMFrameCount * wav.channels * sizeof(drwav_int32));
+    size_t numberOfSamplesActuallyDecoded = drwav_read_pcm_frames_s32(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
+
+    ...
+
+    drwav_uninit(&wav);
+    ```
+
+If you just want to quickly open and read the audio data in a single operation you can do something like this:
+
+    ```c
+    unsigned int channels;
+    unsigned int sampleRate;
+    drwav_uint64 totalPCMFrameCount;
+    float* pSampleData = drwav_open_file_and_read_pcm_frames_f32("my_song.wav", &channels, &sampleRate, &totalPCMFrameCount, NULL);
+    if (pSampleData == NULL) {
+        // Error opening and reading WAV file.
+    }
+
+    ...
+
+    drwav_free(pSampleData);
+    ```
+
+The examples above use versions of the API that convert the audio data to a consistent format (32-bit signed PCM, in this case), but you can still output the
+audio data in its internal format (see notes below for supported formats):
+
+    ```c
+    size_t framesRead = drwav_read_pcm_frames(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
+    ```
+
+You can also read the raw bytes of audio data, which could be useful if dr_wav does not have native support for a particular data format:
+
+    ```c
+    size_t bytesRead = drwav_read_raw(&wav, bytesToRead, pRawDataBuffer);
+    ```
+
+dr_wav can also be used to output WAV files. This does not currently support compressed formats. To use this, look at `drwav_init_write()`,
+`drwav_init_file_write()`, etc. Use `drwav_write_pcm_frames()` to write samples, or `drwav_write_raw()` to write raw data in the "data" chunk.
+
+    ```c
+    drwav_data_format format;
+    format.container = drwav_container_riff;     // <-- drwav_container_riff = normal WAV files, drwav_container_w64 = Sony Wave64.
+    format.format = DR_WAVE_FORMAT_PCM;          // <-- Any of the DR_WAVE_FORMAT_* codes.
+    format.channels = 2;
+    format.sampleRate = 44100;
+    format.bitsPerSample = 16;
+    drwav_init_file_write(&wav, "data/recording.wav", &format, NULL);
+
+    ...
+
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(pWav, frameCount, pSamples);
+    ```
+
+dr_wav has seamless support for the Sony Wave64 format. The decoder will automatically detect it and it should Just Work without any manual intervention.
+
+
+Build Options
+=============
+#define these options before including this file.
+
+#define DR_WAV_NO_CONVERSION_API
+  Disables conversion APIs such as `drwav_read_pcm_frames_f32()` and `drwav_s16_to_f32()`.
+
+#define DR_WAV_NO_STDIO
+  Disables APIs that initialize a decoder from a file such as `drwav_init_file()`, `drwav_init_file_write()`, etc.
+
+
+
+Notes
+=====
+- Samples are always interleaved.
+- The default read function does not do any data conversion. Use `drwav_read_pcm_frames_f32()`, `drwav_read_pcm_frames_s32()` and `drwav_read_pcm_frames_s16()`
+  to read and convert audio data to 32-bit floating point, signed 32-bit integer and signed 16-bit integer samples respectively. Tested and supported internal
+  formats include the following:
+  - Unsigned 8-bit PCM
+  - Signed 12-bit PCM
+  - Signed 16-bit PCM
+  - Signed 24-bit PCM
+  - Signed 32-bit PCM
+  - IEEE 32-bit floating point
+  - IEEE 64-bit floating point
+  - A-law and u-law
+  - Microsoft ADPCM
+  - IMA ADPCM (DVI, format code 0x11)
+- dr_wav will try to read the WAV file as best it can, even if it's not strictly conformant to the WAV format.
+*/
+
+#ifndef dr_wav_h
+#define dr_wav_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DRWAV_STRINGIFY(x)      #x
+#define DRWAV_XSTRINGIFY(x)     DRWAV_STRINGIFY(x)
+
+#define DRWAV_VERSION_MAJOR     0
+#define DRWAV_VERSION_MINOR     12
+#define DRWAV_VERSION_REVISION  16
+#define DRWAV_VERSION_STRING    DRWAV_XSTRINGIFY(DRWAV_VERSION_MAJOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_MINOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_REVISION)
+
+#include <stddef.h> /* For size_t. */
+
+/* Sized types. */
+typedef   signed char           drwav_int8;
+typedef unsigned char           drwav_uint8;
+typedef   signed short          drwav_int16;
+typedef unsigned short          drwav_uint16;
+typedef   signed int            drwav_int32;
+typedef unsigned int            drwav_uint32;
+#if defined(_MSC_VER)
+    typedef   signed __int64    drwav_int64;
+    typedef unsigned __int64    drwav_uint64;
+#else
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wlong-long"
+        #if defined(__clang__)
+            #pragma GCC diagnostic ignored "-Wc++11-long-long"
+        #endif
+    #endif
+    typedef   signed long long  drwav_int64;
+    typedef unsigned long long  drwav_uint64;
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic pop
+    #endif
+#endif
+#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)
+    typedef drwav_uint64        drwav_uintptr;
+#else
+    typedef drwav_uint32        drwav_uintptr;
+#endif
+typedef drwav_uint8             drwav_bool8;
+typedef drwav_uint32            drwav_bool32;
+#define DRWAV_TRUE              1
+#define DRWAV_FALSE             0
+
+#if !defined(DRWAV_API)
+    #if defined(DRWAV_DLL)
+        #if defined(_WIN32)
+            #define DRWAV_DLL_IMPORT  __declspec(dllimport)
+            #define DRWAV_DLL_EXPORT  __declspec(dllexport)
+            #define DRWAV_DLL_PRIVATE static
+        #else
+            #if defined(__GNUC__) && __GNUC__ >= 4
+                #define DRWAV_DLL_IMPORT  __attribute__((visibility("default")))
+                #define DRWAV_DLL_EXPORT  __attribute__((visibility("default")))
+                #define DRWAV_DLL_PRIVATE __attribute__((visibility("hidden")))
+            #else
+                #define DRWAV_DLL_IMPORT
+                #define DRWAV_DLL_EXPORT
+                #define DRWAV_DLL_PRIVATE static
+            #endif
+        #endif
+
+        #if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
+            #define DRWAV_API  DRWAV_DLL_EXPORT
+        #else
+            #define DRWAV_API  DRWAV_DLL_IMPORT
+        #endif
+        #define DRWAV_PRIVATE DRWAV_DLL_PRIVATE
+    #else
+        #define DRWAV_API extern
+        #define DRWAV_PRIVATE static
+    #endif
+#endif
+
+typedef drwav_int32 drwav_result;
+#define DRWAV_SUCCESS                        0
+#define DRWAV_ERROR -1 /* A generic error. */ +#define DRWAV_INVALID_ARGS -2 +#define DRWAV_INVALID_OPERATION -3 +#define DRWAV_OUT_OF_MEMORY -4 +#define DRWAV_OUT_OF_RANGE -5 +#define DRWAV_ACCESS_DENIED -6 +#define DRWAV_DOES_NOT_EXIST -7 +#define DRWAV_ALREADY_EXISTS -8 +#define DRWAV_TOO_MANY_OPEN_FILES -9 +#define DRWAV_INVALID_FILE -10 +#define DRWAV_TOO_BIG -11 +#define DRWAV_PATH_TOO_LONG -12 +#define DRWAV_NAME_TOO_LONG -13 +#define DRWAV_NOT_DIRECTORY -14 +#define DRWAV_IS_DIRECTORY -15 +#define DRWAV_DIRECTORY_NOT_EMPTY -16 +#define DRWAV_END_OF_FILE -17 +#define DRWAV_NO_SPACE -18 +#define DRWAV_BUSY -19 +#define DRWAV_IO_ERROR -20 +#define DRWAV_INTERRUPT -21 +#define DRWAV_UNAVAILABLE -22 +#define DRWAV_ALREADY_IN_USE -23 +#define DRWAV_BAD_ADDRESS -24 +#define DRWAV_BAD_SEEK -25 +#define DRWAV_BAD_PIPE -26 +#define DRWAV_DEADLOCK -27 +#define DRWAV_TOO_MANY_LINKS -28 +#define DRWAV_NOT_IMPLEMENTED -29 +#define DRWAV_NO_MESSAGE -30 +#define DRWAV_BAD_MESSAGE -31 +#define DRWAV_NO_DATA_AVAILABLE -32 +#define DRWAV_INVALID_DATA -33 +#define DRWAV_TIMEOUT -34 +#define DRWAV_NO_NETWORK -35 +#define DRWAV_NOT_UNIQUE -36 +#define DRWAV_NOT_SOCKET -37 +#define DRWAV_NO_ADDRESS -38 +#define DRWAV_BAD_PROTOCOL -39 +#define DRWAV_PROTOCOL_UNAVAILABLE -40 +#define DRWAV_PROTOCOL_NOT_SUPPORTED -41 +#define DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED -42 +#define DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED -43 +#define DRWAV_SOCKET_NOT_SUPPORTED -44 +#define DRWAV_CONNECTION_RESET -45 +#define DRWAV_ALREADY_CONNECTED -46 +#define DRWAV_NOT_CONNECTED -47 +#define DRWAV_CONNECTION_REFUSED -48 +#define DRWAV_NO_HOST -49 +#define DRWAV_IN_PROGRESS -50 +#define DRWAV_CANCELLED -51 +#define DRWAV_MEMORY_ALREADY_MAPPED -52 +#define DRWAV_AT_END -53 + +/* Common data formats. */ +#define DR_WAVE_FORMAT_PCM 0x1 +#define DR_WAVE_FORMAT_ADPCM 0x2 +#define DR_WAVE_FORMAT_IEEE_FLOAT 0x3 +#define DR_WAVE_FORMAT_ALAW 0x6 +#define DR_WAVE_FORMAT_MULAW 0x7 +#define DR_WAVE_FORMAT_DVI_ADPCM 0x11 +#define DR_WAVE_FORMAT_EXTENSIBLE 0xFFFE + +/* Constants. */ +#ifndef DRWAV_MAX_SMPL_LOOPS +#define DRWAV_MAX_SMPL_LOOPS 1 +#endif + +/* Flags to pass into drwav_init_ex(), etc. */ +#define DRWAV_SEQUENTIAL 0x00000001 + +DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision); +DRWAV_API const char* drwav_version_string(void); + +typedef enum +{ + drwav_seek_origin_start, + drwav_seek_origin_current +} drwav_seek_origin; + +typedef enum +{ + drwav_container_riff, + drwav_container_w64, + drwav_container_rf64 +} drwav_container; + +typedef struct +{ + union + { + drwav_uint8 fourcc[4]; + drwav_uint8 guid[16]; + } id; + + /* The size in bytes of the chunk. */ + drwav_uint64 sizeInBytes; + + /* + RIFF = 2 byte alignment. + W64 = 8 byte alignment. + */ + unsigned int paddingSize; +} drwav_chunk_header; + +typedef struct +{ + /* + The format tag exactly as specified in the wave file's "fmt" chunk. This can be used by applications + that require support for data formats not natively supported by dr_wav. + */ + drwav_uint16 formatTag; + + /* The number of channels making up the audio data. When this is set to 1 it is mono, 2 is stereo, etc. */ + drwav_uint16 channels; + + /* The sample rate. Usually set to something like 44100. */ + drwav_uint32 sampleRate; + + /* Average bytes per second. You probably don't need this, but it's left here for informational purposes. */ + drwav_uint32 avgBytesPerSec; + + /* Block align. This is equal to the number of channels * bytes per sample. 
*/
+    drwav_uint16 blockAlign;
+
+    /* Bits per sample. */
+    drwav_uint16 bitsPerSample;
+
+    /* The size of the extended data. Only used internally for validation, but left here for informational purposes. */
+    drwav_uint16 extendedSize;
+
+    /*
+    The number of valid bits per sample. When <formatTag> is equal to WAVE_FORMAT_EXTENSIBLE, <bitsPerSample>
+    is always rounded up to the nearest multiple of 8. This variable contains information about exactly how
+    many bits are valid per sample. Mainly used for informational purposes.
+    */
+    drwav_uint16 validBitsPerSample;
+
+    /* The channel mask. Not used at the moment. */
+    drwav_uint32 channelMask;
+
+    /* The sub-format, exactly as specified by the wave file. */
+    drwav_uint8 subFormat[16];
+} drwav_fmt;
+
+DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT);
+
+
+/*
+Callback for when data is read. Return value is the number of bytes actually read.
+
+pUserData   [in]  The user data that was passed to drwav_init() and family.
+pBufferOut  [out] The output buffer.
+bytesToRead [in]  The number of bytes to read.
+
+Returns the number of bytes actually read.
+
+A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until
+either the entire bytesToRead is filled or you have reached the end of the stream.
+*/
+typedef size_t (* drwav_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
+
+/*
+Callback for when data is written. Return value is the number of bytes actually written.
+
+pUserData    [in]  The user data that was passed to drwav_init_write() and family.
+pData        [out] A pointer to the data to write.
+bytesToWrite [in]  The number of bytes to write.
+
+Returns the number of bytes actually written.
+
+If the return value differs from bytesToWrite, it indicates an error.
+*/
+typedef size_t (* drwav_write_proc)(void* pUserData, const void* pData, size_t bytesToWrite);
+
+/*
+Callback for when data needs to be seeked.
+
+pUserData [in] The user data that was passed to drwav_init() and family.
+offset    [in] The number of bytes to move, relative to the origin. Will never be negative.
+origin    [in] The origin of the seek - the current position or the start of the stream.
+
+Returns whether or not the seek was successful.
+
+Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be either drwav_seek_origin_start or
+drwav_seek_origin_current.
+*/
+typedef drwav_bool32 (* drwav_seek_proc)(void* pUserData, int offset, drwav_seek_origin origin);
+
+/*
+Callback for when drwav_init_ex() finds a chunk.
+
+pChunkUserData    [in] The user data that was passed to the pChunkUserData parameter of drwav_init_ex() and family.
+onRead            [in] A pointer to the function to call when reading.
+onSeek            [in] A pointer to the function to call when seeking.
+pReadSeekUserData [in] The user data that was passed to the pReadSeekUserData parameter of drwav_init_ex() and family.
+pChunkHeader      [in] A pointer to an object containing basic header information about the chunk. Use this to identify the chunk.
+container         [in] Whether or not the WAV file is a RIFF or Wave64 container. If you're unsure of the difference, assume RIFF.
+pFMT              [in] A pointer to the object containing the contents of the "fmt" chunk.
+
+Returns the number of bytes read + seeked.
+
+To read data from the chunk, call onRead(), passing in pReadSeekUserData as the first parameter. Do the same for seeking with onSeek(). The return value must
+be the total number of bytes you have read _plus_ seeked.
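+
+As an illustrative sketch (the callback name is hypothetical), a callback that simply skips over every chunk it is
+handed could look like this:
+
+    ```c
+    drwav_uint64 my_on_chunk(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData,
+                             const drwav_chunk_header* pChunkHeader, drwav_container container, const drwav_fmt* pFMT)
+    {
+        // Seek past the chunk's data without reading it (assumes the chunk size fits in an int).
+        if (!onSeek(pReadSeekUserData, (int)pChunkHeader->sizeInBytes, drwav_seek_origin_current)) {
+            return 0;   // Nothing was read or seeked.
+        }
+
+        return pChunkHeader->sizeInBytes;   // Total number of bytes read + seeked.
+    }
+    ```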
+ +Use the `container` argument to discriminate the fields in `pChunkHeader->id`. If the container is `drwav_container_riff` or `drwav_container_rf64` you should +use `id.fourcc`, otherwise you should use `id.guid`. + +The `pFMT` parameter can be used to determine the data format of the wave file. Use `drwav_fmt_get_format()` to get the sample format, which will be one of the +`DR_WAVE_FORMAT_*` identifiers. + +The read pointer will be sitting on the first byte after the chunk's header. You must not attempt to read beyond the boundary of the chunk. +*/ +typedef drwav_uint64 (* drwav_chunk_proc)(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader, drwav_container container, const drwav_fmt* pFMT); + +typedef struct +{ + void* pUserData; + void* (* onMalloc)(size_t sz, void* pUserData); + void* (* onRealloc)(void* p, size_t sz, void* pUserData); + void (* onFree)(void* p, void* pUserData); +} drwav_allocation_callbacks; + +/* Structure for internal use. Only used for loaders opened with drwav_init_memory(). */ +typedef struct +{ + const drwav_uint8* data; + size_t dataSize; + size_t currentReadPos; +} drwav__memory_stream; + +/* Structure for internal use. Only used for writers opened with drwav_init_memory_write(). */ +typedef struct +{ + void** ppData; + size_t* pDataSize; + size_t dataSize; + size_t dataCapacity; + size_t currentWritePos; +} drwav__memory_stream_write; + +typedef struct +{ + drwav_container container; /* RIFF, W64. */ + drwav_uint32 format; /* DR_WAVE_FORMAT_* */ + drwav_uint32 channels; + drwav_uint32 sampleRate; + drwav_uint32 bitsPerSample; +} drwav_data_format; + + +/* See the following for details on the 'smpl' chunk: https://sites.google.com/site/musicgapi/technical-documents/wav-file-format#smpl */ +typedef struct +{ + drwav_uint32 cuePointId; + drwav_uint32 type; + drwav_uint32 start; + drwav_uint32 end; + drwav_uint32 fraction; + drwav_uint32 playCount; +} drwav_smpl_loop; + + typedef struct +{ + drwav_uint32 manufacturer; + drwav_uint32 product; + drwav_uint32 samplePeriod; + drwav_uint32 midiUnityNotes; + drwav_uint32 midiPitchFraction; + drwav_uint32 smpteFormat; + drwav_uint32 smpteOffset; + drwav_uint32 numSampleLoops; + drwav_uint32 samplerData; + drwav_smpl_loop loops[DRWAV_MAX_SMPL_LOOPS]; +} drwav_smpl; + +typedef struct +{ + /* A pointer to the function to call when more data is needed. */ + drwav_read_proc onRead; + + /* A pointer to the function to call when data needs to be written. Only used when the drwav object is opened in write mode. */ + drwav_write_proc onWrite; + + /* A pointer to the function to call when the wav file needs to be seeked. */ + drwav_seek_proc onSeek; + + /* The user data to pass to callbacks. */ + void* pUserData; + + /* Allocation callbacks. */ + drwav_allocation_callbacks allocationCallbacks; + + + /* Whether or not the WAV file is formatted as a standard RIFF file or W64. */ + drwav_container container; + + + /* Structure containing format information exactly as specified by the wav file. */ + drwav_fmt fmt; + + /* The sample rate. Will be set to something like 44100. */ + drwav_uint32 sampleRate; + + /* The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. */ + drwav_uint16 channels; + + /* The bits per sample. Will be set to something like 16, 24, etc. 
*/ + drwav_uint16 bitsPerSample; + + /* Equal to fmt.formatTag, or the value specified by fmt.subFormat if fmt.formatTag is equal to 65534 (WAVE_FORMAT_EXTENSIBLE). */ + drwav_uint16 translatedFormatTag; + + /* The total number of PCM frames making up the audio data. */ + drwav_uint64 totalPCMFrameCount; + + + /* The size in bytes of the data chunk. */ + drwav_uint64 dataChunkDataSize; + + /* The position in the stream of the first byte of the data chunk. This is used for seeking. */ + drwav_uint64 dataChunkDataPos; + + /* The number of bytes remaining in the data chunk. */ + drwav_uint64 bytesRemaining; + + + /* + Only used in sequential write mode. Keeps track of the desired size of the "data" chunk at the point of initialization time. Always + set to 0 for non-sequential writes and when the drwav object is opened in read mode. Used for validation. + */ + drwav_uint64 dataChunkDataSizeTargetWrite; + + /* Keeps track of whether or not the wav writer was initialized in sequential mode. */ + drwav_bool32 isSequentialWrite; + + + /* smpl chunk. */ + drwav_smpl smpl; + + + /* A hack to avoid a DRWAV_MALLOC() when opening a decoder with drwav_init_memory(). */ + drwav__memory_stream memoryStream; + drwav__memory_stream_write memoryStreamWrite; + + /* Generic data for compressed formats. This data is shared across all block-compressed formats. */ + struct + { + drwav_uint64 iCurrentPCMFrame; /* The index of the next PCM frame that will be read by drwav_read_*(). This is used with "totalPCMFrameCount" to ensure we don't read excess samples at the end of the last block. */ + } compressed; + + /* Microsoft ADPCM specific data. */ + struct + { + drwav_uint32 bytesRemainingInBlock; + drwav_uint16 predictor[2]; + drwav_int32 delta[2]; + drwav_int32 cachedFrames[4]; /* Samples are stored in this cache during decoding. */ + drwav_uint32 cachedFrameCount; + drwav_int32 prevFrames[2][2]; /* The previous 2 samples for each channel (2 channels at most). */ + } msadpcm; + + /* IMA ADPCM specific data. */ + struct + { + drwav_uint32 bytesRemainingInBlock; + drwav_int32 predictor[2]; + drwav_int32 stepIndex[2]; + drwav_int32 cachedFrames[16]; /* Samples are stored in this cache during decoding. */ + drwav_uint32 cachedFrameCount; + } ima; +} drwav; + + +/* +Initializes a pre-allocated drwav object for reading. + +pWav [out] A pointer to the drwav object being initialized. +onRead [in] The function to call when data needs to be read from the client. +onSeek [in] The function to call when the read position of the client data needs to move. +onChunk [in, optional] The function to call when a chunk is enumerated at initialized time. +pUserData, pReadSeekUserData [in, optional] A pointer to application defined data that will be passed to onRead and onSeek. +pChunkUserData [in, optional] A pointer to application defined data that will be passed to onChunk. +flags [in, optional] A set of flags for controlling how things are loaded. + +Returns true if successful; false otherwise. + +Close the loader with drwav_uninit(). + +This is the lowest level function for initializing a WAV file. You can also use drwav_init_file() and drwav_init_memory() +to open the stream from a file or from a block of memory respectively. + +Possible values for flags: + DRWAV_SEQUENTIAL: Never perform a backwards seek while loading. This disables the chunk callback and will cause this function + to return as soon as the data chunk is found. Any chunks after the data chunk will be ignored. 
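+
+A minimal read setup might look like the following (an illustrative sketch; my_read, my_seek and my_stream are
+hypothetical, application-defined callbacks and state):
+
+    ```c
+    drwav wav;
+    if (!drwav_init(&wav, my_read, my_seek, &my_stream, NULL)) {
+        // Initialization failed.
+    }
+    ```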
+ +drwav_init() is equivalent to "drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0);". + +The onChunk callback is not called for the WAVE or FMT chunks. The contents of the FMT chunk can be read from pWav->fmt +after the function returns. + +See also: drwav_init_file(), drwav_init_memory(), drwav_uninit() +*/ +DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Initializes a pre-allocated drwav object for writing. + +onWrite [in] The function to call when data needs to be written. +onSeek [in] The function to call when the write position needs to move. +pUserData [in, optional] A pointer to application defined data that will be passed to onWrite and onSeek. + +Returns true if successful; false otherwise. + +Close the writer with drwav_uninit(). + +This is the lowest level function for initializing a WAV file. You can also use drwav_init_file_write() and drwav_init_memory_write() +to open the stream from a file or from a block of memory respectively. + +If the total sample count is known, you can use drwav_init_write_sequential(). This avoids the need for dr_wav to perform +a post-processing step for storing the total sample count and the size of the data chunk which requires a backwards seek. + +See also: drwav_init_file_write(), drwav_init_memory_write(), drwav_uninit() +*/ +DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Utility function to determine the target size of the entire data to be written (including all headers and chunks). + +Returns the target size in bytes. + +Useful if the application needs to know the size to allocate. + +Only writing to the RIFF chunk and one data chunk is currently supported. + +See also: drwav_init_write(), drwav_init_file_write(), drwav_init_memory_write() +*/ +DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount); + +/* +Uninitializes the given drwav object. + +Use this only for objects initialized with drwav_init*() functions (drwav_init(), drwav_init_ex(), drwav_init_write(), drwav_init_write_sequential()). +*/ +DRWAV_API drwav_result drwav_uninit(drwav* pWav); + + +/* +Reads raw audio data. + +This is the lowest level function for reading audio data. It simply reads the given number of +bytes of the raw internal sample data. + +Consider using drwav_read_pcm_frames_s16(), drwav_read_pcm_frames_s32() or drwav_read_pcm_frames_f32() for +reading sample data in a consistent format. + +pBufferOut can be NULL in which case a seek will be performed. 
+
+Returns the number of bytes actually read.
+*/
+DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut);
+
+/*
+Reads up to the specified number of PCM frames from the WAV file.
+
+The output data will be in the file's internal format, converted to native-endian byte order. Use
+drwav_read_pcm_frames_s16/f32/s32() to read data in a specific format.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached or
+you have requested more PCM frames than can possibly fit in the output buffer.
+
+This function will only work when sample data is of a fixed size and uncompressed. If you are
+using a compressed format consider using drwav_read_raw() or drwav_read_pcm_frames_s16/s32/f32().
+
+pBufferOut can be NULL in which case a seek will be performed.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+
+/*
+Seeks to the given PCM frame.
+
+Returns true if successful; false otherwise.
+*/
+DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex);
+
+
+/*
+Writes raw audio data.
+
+Returns the number of bytes actually written. If this differs from bytesToWrite, it indicates an error.
+*/
+DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData);
+
+/*
+Writes PCM frames.
+
+Returns the number of PCM frames written.
+
+Input samples need to be in native-endian byte order. On big-endian architectures the input data will be converted to
+little-endian. Use drwav_write_raw() to write raw audio data without performing any conversion.
+*/
+DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+
+
+/* Conversion Utilities */
+#ifndef DR_WAV_NO_CONVERSION_API
+
+/*
+Reads a chunk of audio data and converts it to signed 16-bit PCM samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+
+/* Low-level function for converting unsigned 8-bit PCM samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 24-bit PCM samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 32-bit PCM samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 32-bit floating point samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 64-bit floating point samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount);
+
+/* Low-level function for converting A-law samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting u-law samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+
+/*
+Reads a chunk of audio data and converts it to IEEE 32-bit floating point samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+
+/* Low-level function for converting unsigned 8-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 16-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 24-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 32-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 64-bit floating point samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount);
+
+/* Low-level function for converting A-law samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting u-law samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+
+/*
+Reads a chunk of audio data and converts it to signed 32-bit PCM samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
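+
+A typical decode loop reads a fixed number of frames per iteration until the stream is exhausted (an illustrative
+sketch; the buffer size is arbitrary):
+
+    ```c
+    drwav_int32 buffer[4096];
+    drwav_uint64 framesRead;
+    while ((framesRead = drwav_read_pcm_frames_s32(&wav, 4096 / wav.channels, buffer)) > 0) {
+        // Process framesRead frames of wav.channels interleaved samples each.
+    }
+    ```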
+*/ +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); + +/* Low-level function for converting unsigned 8-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 16-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount); + +/* Low-level function for converting signed 24-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 32-bit floating point samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 64-bit floating point samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount); + +/* Low-level function for converting A-law samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting u-law samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +#endif /* DR_WAV_NO_CONVERSION_API */ + + +/* High-Level Convenience Helpers */ + +#ifndef DR_WAV_NO_STDIO +/* +Helper for initializing a wave file for reading using stdio. + +This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav +objects because the operating system may restrict the number of file handles an application can have open at +any given time. +*/ +DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Helper for initializing a wave file for writing using stdio. + +This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav +objects because the operating system may restrict the number of file handles an application can have open at +any given time. 
+*/ +DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif /* DR_WAV_NO_STDIO */ + +/* +Helper for initializing a loader from a pre-allocated memory buffer. + +This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for +the lifetime of the drwav object. + +The buffer should contain the contents of the entire wave file, not just the sample data. +*/ +DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Helper for initializing a writer which outputs data to a memory buffer. + +dr_wav will manage the memory allocations, however it is up to the caller to free the data with drwav_free(). + +The buffer will remain allocated even after drwav_uninit() is called. The buffer should not be considered valid +until after drwav_uninit() has been called. +*/ +DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); + + +#ifndef DR_WAV_NO_CONVERSION_API +/* +Opens and reads an entire wav file in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. 
+*/ +DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#ifndef DR_WAV_NO_STDIO +/* +Opens and decodes an entire wav file in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. +*/ +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif +/* +Opens and decodes an entire wav file from a block of memory in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. +*/ +DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif + +/* Frees data that was allocated internally by dr_wav. 
*/
+DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+/* Converts bytes from a wav stream to a sized type of native endian. */
+DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data);
+DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data);
+DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data);
+DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data);
+DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data);
+DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data);
+
+/* Compares a GUID for the purpose of checking the type of a Wave64 chunk. */
+DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]);
+
+/* Compares a four-character-code for the purpose of checking the type of a RIFF chunk. */
+DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* dr_wav_h */
+
+
+/************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************
+
+ IMPLEMENTATION
+
+ ************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************/
+#if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
+#ifndef dr_wav_c
+#define dr_wav_c
+
+#include <stdlib.h>
+#include <string.h> /* For memcpy(), memset() */
+#include <limits.h> /* For INT_MAX */
+
+#ifndef DR_WAV_NO_STDIO
+#include <stdio.h>
+#include <wchar.h>
+#endif
+
+/* Standard library stuff. */
+#ifndef DRWAV_ASSERT
+#include <assert.h>
+#define DRWAV_ASSERT(expression)           assert(expression)
+#endif
+#ifndef DRWAV_MALLOC
+#define DRWAV_MALLOC(sz)                   malloc((sz))
+#endif
+#ifndef DRWAV_REALLOC
+#define DRWAV_REALLOC(p, sz)               realloc((p), (sz))
+#endif
+#ifndef DRWAV_FREE
+#define DRWAV_FREE(p)                      free((p))
+#endif
+#ifndef DRWAV_COPY_MEMORY
+#define DRWAV_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
+#endif
+#ifndef DRWAV_ZERO_MEMORY
+#define DRWAV_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
+#endif
+#ifndef DRWAV_ZERO_OBJECT
+#define DRWAV_ZERO_OBJECT(p)               DRWAV_ZERO_MEMORY((p), sizeof(*p))
+#endif
+
+#define drwav_countof(x)                   (sizeof(x) / sizeof(x[0]))
+#define drwav_align(x, a)                  ((((x) + (a) - 1) / (a)) * (a))
+#define drwav_min(a, b)                    (((a) < (b)) ? (a) : (b))
+#define drwav_max(a, b)                    (((a) > (b)) ? (a) : (b))
+#define drwav_clamp(x, lo, hi)             (drwav_max((lo), drwav_min((hi), (x))))
+
+#define DRWAV_MAX_SIMD_VECTOR_SIZE         64  /* 64 for AVX-512 in the future. */
+
+/* CPU architecture. */
+#if defined(__x86_64__) || defined(_M_X64)
+    #define DRWAV_X64
+#elif defined(__i386) || defined(_M_IX86)
+    #define DRWAV_X86
+#elif defined(__arm__) || defined(_M_ARM)
+    #define DRWAV_ARM
+#endif
+
+#ifdef _MSC_VER
+    #define DRWAV_INLINE __forceinline
+#elif defined(__GNUC__)
+    /*
+    I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
+    the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
+    case where "__inline__" is not always defined, thus the compiler emitting these warnings.
When using -std=c89 or -ansi on the + command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue + I am using "__inline__" only when we're compiling in strict ANSI mode. + */ + #if defined(__STRICT_ANSI__) + #define DRWAV_INLINE __inline__ __attribute__((always_inline)) + #else + #define DRWAV_INLINE inline __attribute__((always_inline)) + #endif +#elif defined(__WATCOMC__) + #define DRWAV_INLINE __inline +#else + #define DRWAV_INLINE +#endif + +#if defined(SIZE_MAX) + #define DRWAV_SIZE_MAX SIZE_MAX +#else + #if defined(_WIN64) || defined(_LP64) || defined(__LP64__) + #define DRWAV_SIZE_MAX ((drwav_uint64)0xFFFFFFFFFFFFFFFF) + #else + #define DRWAV_SIZE_MAX 0xFFFFFFFF + #endif +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #define DRWAV_HAS_BYTESWAP64_INTRINSIC +#elif defined(__clang__) + #if defined(__has_builtin) + #if __has_builtin(__builtin_bswap16) + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #endif + #if __has_builtin(__builtin_bswap32) + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #endif + #if __has_builtin(__builtin_bswap64) + #define DRWAV_HAS_BYTESWAP64_INTRINSIC + #endif + #endif +#elif defined(__GNUC__) + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #define DRWAV_HAS_BYTESWAP64_INTRINSIC + #endif + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #endif +#endif + +DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision) +{ + if (pMajor) { + *pMajor = DRWAV_VERSION_MAJOR; + } + + if (pMinor) { + *pMinor = DRWAV_VERSION_MINOR; + } + + if (pRevision) { + *pRevision = DRWAV_VERSION_REVISION; + } +} + +DRWAV_API const char* drwav_version_string(void) +{ + return DRWAV_VERSION_STRING; +} + +/* +These limits are used for basic validation when initializing the decoder. If you exceed these limits, first of all: what on Earth are +you doing?! (Let me know, I'd be curious!) Second, you can adjust these by #define-ing them before the dr_wav implementation. 
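+
+For example (an illustrative sketch; the value is arbitrary):
+
+    ```c
+    #define DRWAV_MAX_CHANNELS 1024   // Accept files with unusually high channel counts.
+    #define DR_WAV_IMPLEMENTATION
+    #include "dr_wav.h"
+    ```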
+*/ +#ifndef DRWAV_MAX_SAMPLE_RATE +#define DRWAV_MAX_SAMPLE_RATE 384000 +#endif +#ifndef DRWAV_MAX_CHANNELS +#define DRWAV_MAX_CHANNELS 256 +#endif +#ifndef DRWAV_MAX_BITS_PER_SAMPLE +#define DRWAV_MAX_BITS_PER_SAMPLE 64 +#endif + +static const drwav_uint8 drwavGUID_W64_RIFF[16] = {0x72,0x69,0x66,0x66, 0x2E,0x91, 0xCF,0x11, 0xA5,0xD6, 0x28,0xDB,0x04,0xC1,0x00,0x00}; /* 66666972-912E-11CF-A5D6-28DB04C10000 */ +static const drwav_uint8 drwavGUID_W64_WAVE[16] = {0x77,0x61,0x76,0x65, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 65766177-ACF3-11D3-8CD1-00C04F8EDB8A */ +/*static const drwav_uint8 drwavGUID_W64_JUNK[16] = {0x6A,0x75,0x6E,0x6B, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};*/ /* 6B6E756A-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 20746D66-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 74636166-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 61746164-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_SMPL[16] = {0x73,0x6D,0x70,0x6C, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 6C706D73-ACF3-11D3-8CD1-00C04F8EDB8A */ + +static DRWAV_INLINE drwav_bool32 drwav__guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]) +{ + int i; + for (i = 0; i < 16; i += 1) { + if (a[i] != b[i]) { + return DRWAV_FALSE; + } + } + + return DRWAV_TRUE; +} + +static DRWAV_INLINE drwav_bool32 drwav__fourcc_equal(const drwav_uint8* a, const char* b) +{ + return + a[0] == b[0] && + a[1] == b[1] && + a[2] == b[2] && + a[3] == b[3]; +} + + + +static DRWAV_INLINE int drwav__is_little_endian(void) +{ +#if defined(DRWAV_X86) || defined(DRWAV_X64) + return DRWAV_TRUE; +#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN + return DRWAV_TRUE; +#else + int n = 1; + return (*(char*)&n) == 1; +#endif +} + +static DRWAV_INLINE drwav_uint16 drwav__bytes_to_u16(const drwav_uint8* data) +{ + return (data[0] << 0) | (data[1] << 8); +} + +static DRWAV_INLINE drwav_int16 drwav__bytes_to_s16(const drwav_uint8* data) +{ + return (short)drwav__bytes_to_u16(data); +} + +static DRWAV_INLINE drwav_uint32 drwav__bytes_to_u32(const drwav_uint8* data) +{ + return (data[0] << 0) | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); +} + +static DRWAV_INLINE drwav_int32 drwav__bytes_to_s32(const drwav_uint8* data) +{ + return (drwav_int32)drwav__bytes_to_u32(data); +} + +static DRWAV_INLINE drwav_uint64 drwav__bytes_to_u64(const drwav_uint8* data) +{ + return + ((drwav_uint64)data[0] << 0) | ((drwav_uint64)data[1] << 8) | ((drwav_uint64)data[2] << 16) | ((drwav_uint64)data[3] << 24) | + ((drwav_uint64)data[4] << 32) | ((drwav_uint64)data[5] << 40) | ((drwav_uint64)data[6] << 48) | ((drwav_uint64)data[7] << 56); +} + +static DRWAV_INLINE drwav_int64 drwav__bytes_to_s64(const drwav_uint8* data) +{ + return (drwav_int64)drwav__bytes_to_u64(data); +} + +static DRWAV_INLINE void drwav__bytes_to_guid(const drwav_uint8* data, drwav_uint8* guid) +{ + int i; + for (i = 0; i < 16; ++i) { + guid[i] = data[i]; + } +} + + +static DRWAV_INLINE drwav_uint16 drwav__bswap16(drwav_uint16 n) +{ +#ifdef DRWAV_HAS_BYTESWAP16_INTRINSIC + #if defined(_MSC_VER) + 
return _byteswap_ushort(n); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_bswap16(n); + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + return ((n & 0xFF00) >> 8) | + ((n & 0x00FF) << 8); +#endif +} + +static DRWAV_INLINE drwav_uint32 drwav__bswap32(drwav_uint32 n) +{ +#ifdef DRWAV_HAS_BYTESWAP32_INTRINSIC + #if defined(_MSC_VER) + return _byteswap_ulong(n); + #elif defined(__GNUC__) || defined(__clang__) + #if defined(DRWAV_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRWAV_64BIT) /* <-- 64-bit inline assembly has not been tested, so disabling for now. */ + /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */ + drwav_uint32 r; + __asm__ __volatile__ ( + #if defined(DRWAV_64BIT) + "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n) /* <-- This is untested. If someone in the community could test this, that would be appreciated! */ + #else + "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n) + #endif + ); + return r; + #else + return __builtin_bswap32(n); + #endif + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + return ((n & 0xFF000000) >> 24) | + ((n & 0x00FF0000) >> 8) | + ((n & 0x0000FF00) << 8) | + ((n & 0x000000FF) << 24); +#endif +} + +static DRWAV_INLINE drwav_uint64 drwav__bswap64(drwav_uint64 n) +{ +#ifdef DRWAV_HAS_BYTESWAP64_INTRINSIC + #if defined(_MSC_VER) + return _byteswap_uint64(n); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_bswap64(n); + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. 
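+    For example, ((drwav_uint64)0xFF000000 << 32) builds the mask 0xFF00000000000000 without spelling out a
+    64-bit literal, which strict C89 compilers reject (C89 has no long long constants or ULL suffix).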
*/ + return ((n & ((drwav_uint64)0xFF000000 << 32)) >> 56) | + ((n & ((drwav_uint64)0x00FF0000 << 32)) >> 40) | + ((n & ((drwav_uint64)0x0000FF00 << 32)) >> 24) | + ((n & ((drwav_uint64)0x000000FF << 32)) >> 8) | + ((n & ((drwav_uint64)0xFF000000 )) << 8) | + ((n & ((drwav_uint64)0x00FF0000 )) << 24) | + ((n & ((drwav_uint64)0x0000FF00 )) << 40) | + ((n & ((drwav_uint64)0x000000FF )) << 56); +#endif +} + + +static DRWAV_INLINE drwav_int16 drwav__bswap_s16(drwav_int16 n) +{ + return (drwav_int16)drwav__bswap16((drwav_uint16)n); +} + +static DRWAV_INLINE void drwav__bswap_samples_s16(drwav_int16* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_s16(pSamples[iSample]); + } +} + + +static DRWAV_INLINE void drwav__bswap_s24(drwav_uint8* p) +{ + drwav_uint8 t; + t = p[0]; + p[0] = p[2]; + p[2] = t; +} + +static DRWAV_INLINE void drwav__bswap_samples_s24(drwav_uint8* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + drwav_uint8* pSample = pSamples + (iSample*3); + drwav__bswap_s24(pSample); + } +} + + +static DRWAV_INLINE drwav_int32 drwav__bswap_s32(drwav_int32 n) +{ + return (drwav_int32)drwav__bswap32((drwav_uint32)n); +} + +static DRWAV_INLINE void drwav__bswap_samples_s32(drwav_int32* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_s32(pSamples[iSample]); + } +} + + +static DRWAV_INLINE float drwav__bswap_f32(float n) +{ + union { + drwav_uint32 i; + float f; + } x; + x.f = n; + x.i = drwav__bswap32(x.i); + + return x.f; +} + +static DRWAV_INLINE void drwav__bswap_samples_f32(float* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_f32(pSamples[iSample]); + } +} + + +static DRWAV_INLINE double drwav__bswap_f64(double n) +{ + union { + drwav_uint64 i; + double f; + } x; + x.f = n; + x.i = drwav__bswap64(x.i); + + return x.f; +} + +static DRWAV_INLINE void drwav__bswap_samples_f64(double* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_f64(pSamples[iSample]); + } +} + + +static DRWAV_INLINE void drwav__bswap_samples_pcm(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample) +{ + /* Assumes integer PCM. Floating point PCM is done in drwav__bswap_samples_ieee(). */ + switch (bytesPerSample) + { + case 2: /* s16, s12 (loosely packed) */ + { + drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount); + } break; + case 3: /* s24 */ + { + drwav__bswap_samples_s24((drwav_uint8*)pSamples, sampleCount); + } break; + case 4: /* s32 */ + { + drwav__bswap_samples_s32((drwav_int32*)pSamples, sampleCount); + } break; + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + +static DRWAV_INLINE void drwav__bswap_samples_ieee(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample) +{ + switch (bytesPerSample) + { + #if 0 /* Contributions welcome for f16 support. 
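+        A minimal byte-level implementation could mirror drwav__bswap_samples_s16() above, swapping each 16-bit
+        element with drwav__bswap16(), since endian swapping is agnostic to the value's format.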
*/ + case 2: /* f16 */ + { + drwav__bswap_samples_f16((drwav_float16*)pSamples, sampleCount); + } break; + #endif + case 4: /* f32 */ + { + drwav__bswap_samples_f32((float*)pSamples, sampleCount); + } break; + case 8: /* f64 */ + { + drwav__bswap_samples_f64((double*)pSamples, sampleCount); + } break; + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + +static DRWAV_INLINE void drwav__bswap_samples(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample, drwav_uint16 format) +{ + switch (format) + { + case DR_WAVE_FORMAT_PCM: + { + drwav__bswap_samples_pcm(pSamples, sampleCount, bytesPerSample); + } break; + + case DR_WAVE_FORMAT_IEEE_FLOAT: + { + drwav__bswap_samples_ieee(pSamples, sampleCount, bytesPerSample); + } break; + + case DR_WAVE_FORMAT_ALAW: + case DR_WAVE_FORMAT_MULAW: + { + drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount); + } break; + + case DR_WAVE_FORMAT_ADPCM: + case DR_WAVE_FORMAT_DVI_ADPCM: + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + + +static void* drwav__malloc_default(size_t sz, void* pUserData) +{ + (void)pUserData; + return DRWAV_MALLOC(sz); +} + +static void* drwav__realloc_default(void* p, size_t sz, void* pUserData) +{ + (void)pUserData; + return DRWAV_REALLOC(p, sz); +} + +static void drwav__free_default(void* p, void* pUserData) +{ + (void)pUserData; + DRWAV_FREE(p); +} + + +static void* drwav__malloc_from_callbacks(size_t sz, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks == NULL) { + return NULL; + } + + if (pAllocationCallbacks->onMalloc != NULL) { + return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData); + } + + /* Try using realloc(). */ + if (pAllocationCallbacks->onRealloc != NULL) { + return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData); + } + + return NULL; +} + +static void* drwav__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks == NULL) { + return NULL; + } + + if (pAllocationCallbacks->onRealloc != NULL) { + return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData); + } + + /* Try emulating realloc() in terms of malloc()/free(). */ + if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) { + void* p2; + + p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData); + if (p2 == NULL) { + return NULL; + } + + if (p != NULL) { + DRWAV_COPY_MEMORY(p2, p, szOld); + pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData); + } + + return p2; + } + + return NULL; +} + +static void drwav__free_from_callbacks(void* p, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (p == NULL || pAllocationCallbacks == NULL) { + return; + } + + if (pAllocationCallbacks->onFree != NULL) { + pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData); + } +} + + +static drwav_allocation_callbacks drwav_copy_allocation_callbacks_or_defaults(const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks != NULL) { + /* Copy. */ + return *pAllocationCallbacks; + } else { + /* Defaults. 
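+        These route through drwav__malloc_default, drwav__realloc_default and drwav__free_default above, which
+        expand to the DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE macros (plain malloc, realloc and free unless
+        those macros were overridden at compile time).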
*/ + drwav_allocation_callbacks allocationCallbacks; + allocationCallbacks.pUserData = NULL; + allocationCallbacks.onMalloc = drwav__malloc_default; + allocationCallbacks.onRealloc = drwav__realloc_default; + allocationCallbacks.onFree = drwav__free_default; + return allocationCallbacks; + } +} + + +static DRWAV_INLINE drwav_bool32 drwav__is_compressed_format_tag(drwav_uint16 formatTag) +{ + return + formatTag == DR_WAVE_FORMAT_ADPCM || + formatTag == DR_WAVE_FORMAT_DVI_ADPCM; +} + +static unsigned int drwav__chunk_padding_size_riff(drwav_uint64 chunkSize) +{ + return (unsigned int)(chunkSize % 2); +} + +static unsigned int drwav__chunk_padding_size_w64(drwav_uint64 chunkSize) +{ + return (unsigned int)(chunkSize % 8); +} + +static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut); +static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut); +static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount); + +static drwav_result drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_chunk_header* pHeaderOut) +{ + if (container == drwav_container_riff || container == drwav_container_rf64) { + drwav_uint8 sizeInBytes[4]; + + if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) { + return DRWAV_AT_END; + } + + if (onRead(pUserData, sizeInBytes, 4) != 4) { + return DRWAV_INVALID_FILE; + } + + pHeaderOut->sizeInBytes = drwav__bytes_to_u32(sizeInBytes); + pHeaderOut->paddingSize = drwav__chunk_padding_size_riff(pHeaderOut->sizeInBytes); + *pRunningBytesReadOut += 8; + } else { + drwav_uint8 sizeInBytes[8]; + + if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) { + return DRWAV_AT_END; + } + + if (onRead(pUserData, sizeInBytes, 8) != 8) { + return DRWAV_INVALID_FILE; + } + + pHeaderOut->sizeInBytes = drwav__bytes_to_u64(sizeInBytes) - 24; /* <-- Subtract 24 because w64 includes the size of the header. */ + pHeaderOut->paddingSize = drwav__chunk_padding_size_w64(pHeaderOut->sizeInBytes); + *pRunningBytesReadOut += 24; + } + + return DRWAV_SUCCESS; +} + +static drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData) +{ + drwav_uint64 bytesRemainingToSeek = offset; + while (bytesRemainingToSeek > 0) { + if (bytesRemainingToSeek > 0x7FFFFFFF) { + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + bytesRemainingToSeek -= 0x7FFFFFFF; + } else { + if (!onSeek(pUserData, (int)bytesRemainingToSeek, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + bytesRemainingToSeek = 0; + } + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav__seek_from_start(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData) +{ + if (offset <= 0x7FFFFFFF) { + return onSeek(pUserData, (int)offset, drwav_seek_origin_start); + } + + /* Larger than 32-bit seek. */ + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_start)) { + return DRWAV_FALSE; + } + offset -= 0x7FFFFFFF; + + for (;;) { + if (offset <= 0x7FFFFFFF) { + return onSeek(pUserData, (int)offset, drwav_seek_origin_current); + } + + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + offset -= 0x7FFFFFFF; + } + + /* Should never get here. 
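+    Every pass through the for (;;) loop above either returns or shrinks the remaining offset, and the
+    offset <= 0x7FFFFFFF branch always returns, so control cannot fall out of the loop.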
*/ + /*return DRWAV_TRUE; */ +} + + +static drwav_bool32 drwav__read_fmt(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_fmt* fmtOut) +{ + drwav_chunk_header header; + drwav_uint8 fmt[16]; + + if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + + /* Skip non-fmt chunks. */ + while (((container == drwav_container_riff || container == drwav_container_rf64) && !drwav__fourcc_equal(header.id.fourcc, "fmt ")) || (container == drwav_container_w64 && !drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT))) { + if (!drwav__seek_forward(onSeek, header.sizeInBytes + header.paddingSize, pUserData)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += header.sizeInBytes + header.paddingSize; + + /* Try the next header. */ + if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + } + + + /* Validation. */ + if (container == drwav_container_riff || container == drwav_container_rf64) { + if (!drwav__fourcc_equal(header.id.fourcc, "fmt ")) { + return DRWAV_FALSE; + } + } else { + if (!drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT)) { + return DRWAV_FALSE; + } + } + + + if (onRead(pUserData, fmt, sizeof(fmt)) != sizeof(fmt)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += sizeof(fmt); + + fmtOut->formatTag = drwav__bytes_to_u16(fmt + 0); + fmtOut->channels = drwav__bytes_to_u16(fmt + 2); + fmtOut->sampleRate = drwav__bytes_to_u32(fmt + 4); + fmtOut->avgBytesPerSec = drwav__bytes_to_u32(fmt + 8); + fmtOut->blockAlign = drwav__bytes_to_u16(fmt + 12); + fmtOut->bitsPerSample = drwav__bytes_to_u16(fmt + 14); + + fmtOut->extendedSize = 0; + fmtOut->validBitsPerSample = 0; + fmtOut->channelMask = 0; + memset(fmtOut->subFormat, 0, sizeof(fmtOut->subFormat)); + + if (header.sizeInBytes > 16) { + drwav_uint8 fmt_cbSize[2]; + int bytesReadSoFar = 0; + + if (onRead(pUserData, fmt_cbSize, sizeof(fmt_cbSize)) != sizeof(fmt_cbSize)) { + return DRWAV_FALSE; /* Expecting more data. */ + } + *pRunningBytesReadOut += sizeof(fmt_cbSize); + + bytesReadSoFar = 18; + + fmtOut->extendedSize = drwav__bytes_to_u16(fmt_cbSize); + if (fmtOut->extendedSize > 0) { + /* Simple validation. */ + if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) { + if (fmtOut->extendedSize != 22) { + return DRWAV_FALSE; + } + } + + if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) { + drwav_uint8 fmtext[22]; + if (onRead(pUserData, fmtext, fmtOut->extendedSize) != fmtOut->extendedSize) { + return DRWAV_FALSE; /* Expecting more data. */ + } + + fmtOut->validBitsPerSample = drwav__bytes_to_u16(fmtext + 0); + fmtOut->channelMask = drwav__bytes_to_u32(fmtext + 2); + drwav__bytes_to_guid(fmtext + 6, fmtOut->subFormat); + } else { + if (!onSeek(pUserData, fmtOut->extendedSize, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + } + *pRunningBytesReadOut += fmtOut->extendedSize; + + bytesReadSoFar += fmtOut->extendedSize; + } + + /* Seek past any leftover bytes. For w64 the leftover will be defined based on the chunk size. 
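+        At this point bytesReadSoFar covers the 16 byte base fmt data, the 2 byte cbSize field and any extension
+        bytes consumed, so the seek below skips whatever the chunk declares beyond that.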
*/ + if (!onSeek(pUserData, (int)(header.sizeInBytes - bytesReadSoFar), drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += (header.sizeInBytes - bytesReadSoFar); + } + + if (header.paddingSize > 0) { + if (!onSeek(pUserData, header.paddingSize, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += header.paddingSize; + } + + return DRWAV_TRUE; +} + + +static size_t drwav__on_read(drwav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor) +{ + size_t bytesRead; + + DRWAV_ASSERT(onRead != NULL); + DRWAV_ASSERT(pCursor != NULL); + + bytesRead = onRead(pUserData, pBufferOut, bytesToRead); + *pCursor += bytesRead; + return bytesRead; +} + +#if 0 +static drwav_bool32 drwav__on_seek(drwav_seek_proc onSeek, void* pUserData, int offset, drwav_seek_origin origin, drwav_uint64* pCursor) +{ + DRWAV_ASSERT(onSeek != NULL); + DRWAV_ASSERT(pCursor != NULL); + + if (!onSeek(pUserData, offset, origin)) { + return DRWAV_FALSE; + } + + if (origin == drwav_seek_origin_start) { + *pCursor = offset; + } else { + *pCursor += offset; + } + + return DRWAV_TRUE; +} +#endif + + + +static drwav_uint32 drwav_get_bytes_per_pcm_frame(drwav* pWav) +{ + /* + The bytes per frame is a bit ambiguous. It can be either be based on the bits per sample, or the block align. The way I'm doing it here + is that if the bits per sample is a multiple of 8, use floor(bitsPerSample*channels/8), otherwise fall back to the block align. + */ + if ((pWav->bitsPerSample & 0x7) == 0) { + /* Bits per sample is a multiple of 8. */ + return (pWav->bitsPerSample * pWav->fmt.channels) >> 3; + } else { + return pWav->fmt.blockAlign; + } +} + +DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT) +{ + if (pFMT == NULL) { + return 0; + } + + if (pFMT->formatTag != DR_WAVE_FORMAT_EXTENSIBLE) { + return pFMT->formatTag; + } else { + return drwav__bytes_to_u16(pFMT->subFormat); /* Only the first two bytes are required. */ + } +} + +static drwav_bool32 drwav_preinit(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pWav == NULL || onRead == NULL || onSeek == NULL) { + return DRWAV_FALSE; + } + + DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav)); + pWav->onRead = onRead; + pWav->onSeek = onSeek; + pWav->pUserData = pReadSeekUserData; + pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks); + + if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) { + return DRWAV_FALSE; /* Invalid allocation callbacks. */ + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init__internal(drwav* pWav, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags) +{ + /* This function assumes drwav_preinit() has been called beforehand. */ + + drwav_uint64 cursor; /* <-- Keeps track of the byte position so we can seek to specific locations. */ + drwav_bool32 sequential; + drwav_uint8 riff[4]; + drwav_fmt fmt; + unsigned short translatedFormatTag; + drwav_bool32 foundDataChunk; + drwav_uint64 dataChunkSize = 0; /* <-- Important! Don't explicitly set this to 0 anywhere else. Calculation of the size of the data chunk is performed in different paths depending on the container. */ + drwav_uint64 sampleCountFromFactChunk = 0; /* Same as dataChunkSize - make sure this is the only place this is initialized to 0. 
*/
*/ + drwav_uint64 chunkSize; + + cursor = 0; + sequential = (flags & DRWAV_SEQUENTIAL) != 0; + + /* The first 4 bytes should be the RIFF identifier. */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) { + return DRWAV_FALSE; + } + + /* + The first 4 bytes can be used to identify the container. For RIFF files it will start with "RIFF" and for + w64 it will start with "riff". + */ + if (drwav__fourcc_equal(riff, "RIFF")) { + pWav->container = drwav_container_riff; + } else if (drwav__fourcc_equal(riff, "riff")) { + int i; + drwav_uint8 riff2[12]; + + pWav->container = drwav_container_w64; + + /* Check the rest of the GUID for validity. */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) { + return DRWAV_FALSE; + } + + for (i = 0; i < 12; ++i) { + if (riff2[i] != drwavGUID_W64_RIFF[i+4]) { + return DRWAV_FALSE; + } + } + } else if (drwav__fourcc_equal(riff, "RF64")) { + pWav->container = drwav_container_rf64; + } else { + return DRWAV_FALSE; /* Unknown or unsupported container. */ + } + + + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + drwav_uint8 chunkSizeBytes[4]; + drwav_uint8 wave[4]; + + /* RIFF/WAVE */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) { + return DRWAV_FALSE; + } + + if (pWav->container == drwav_container_riff) { + if (drwav__bytes_to_u32(chunkSizeBytes) < 36) { + return DRWAV_FALSE; /* Chunk size should always be at least 36 bytes. */ + } + } else { + if (drwav__bytes_to_u32(chunkSizeBytes) != 0xFFFFFFFF) { + return DRWAV_FALSE; /* Chunk size should always be set to -1/0xFFFFFFFF for RF64. The actual size is retrieved later. */ + } + } + + if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) { + return DRWAV_FALSE; + } + + if (!drwav__fourcc_equal(wave, "WAVE")) { + return DRWAV_FALSE; /* Expecting "WAVE". */ + } + } else { + drwav_uint8 chunkSizeBytes[8]; + drwav_uint8 wave[16]; + + /* W64 */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) { + return DRWAV_FALSE; + } + + if (drwav__bytes_to_u64(chunkSizeBytes) < 80) { + return DRWAV_FALSE; + } + + if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) { + return DRWAV_FALSE; + } + + if (!drwav__guid_equal(wave, drwavGUID_W64_WAVE)) { + return DRWAV_FALSE; + } + } + + + /* For RF64, the "ds64" chunk must come next, before the "fmt " chunk. */ + if (pWav->container == drwav_container_rf64) { + drwav_uint8 sizeBytes[8]; + drwav_uint64 bytesRemainingInChunk; + drwav_chunk_header header; + drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header); + if (result != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + if (!drwav__fourcc_equal(header.id.fourcc, "ds64")) { + return DRWAV_FALSE; /* Expecting "ds64". */ + } + + bytesRemainingInChunk = header.sizeInBytes + header.paddingSize; + + /* We don't care about the size of the RIFF chunk - skip it. */ + if (!drwav__seek_forward(pWav->onSeek, 8, pWav->pUserData)) { + return DRWAV_FALSE; + } + bytesRemainingInChunk -= 8; + cursor += 8; + + + /* Next 8 bytes is the size of the "data" chunk. 
*/
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
+            return DRWAV_FALSE;
+        }
+        bytesRemainingInChunk -= 8;
+        dataChunkSize = drwav__bytes_to_u64(sizeBytes);
+
+
+        /* Next 8 bytes is the same count which we would usually derive from the FACT chunk if it was available. */
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
+            return DRWAV_FALSE;
+        }
+        bytesRemainingInChunk -= 8;
+        sampleCountFromFactChunk = drwav__bytes_to_u64(sizeBytes);
+
+
+        /* Skip over everything else. */
+        if (!drwav__seek_forward(pWav->onSeek, bytesRemainingInChunk, pWav->pUserData)) {
+            return DRWAV_FALSE;
+        }
+        cursor += bytesRemainingInChunk;
+    }
+
+
+    /* The next bytes should be the "fmt " chunk. */
+    if (!drwav__read_fmt(pWav->onRead, pWav->onSeek, pWav->pUserData, pWav->container, &cursor, &fmt)) {
+        return DRWAV_FALSE;    /* Failed to read the "fmt " chunk. */
+    }
+
+    /* Basic validation. */
+    if ((fmt.sampleRate == 0 || fmt.sampleRate > DRWAV_MAX_SAMPLE_RATE) ||
+        (fmt.channels == 0 || fmt.channels > DRWAV_MAX_CHANNELS) ||
+        (fmt.bitsPerSample == 0 || fmt.bitsPerSample > DRWAV_MAX_BITS_PER_SAMPLE) ||
+        fmt.blockAlign == 0) {
+        return DRWAV_FALSE; /* Probably an invalid WAV file. */
+    }
+
+
+    /* Translate the internal format. */
+    translatedFormatTag = fmt.formatTag;
+    if (translatedFormatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
+        translatedFormatTag = drwav__bytes_to_u16(fmt.subFormat + 0);
+    }
+
+
+    /*
+    We need to enumerate over each chunk for two reasons:
+      1) The "data" chunk may not be the next one
+      2) We may want to report each chunk back to the client
+
+    In order to correctly report each chunk back to the client we will need to keep looping until the end of the file.
+    */
+    foundDataChunk = DRWAV_FALSE;
+
+    /* The next chunk we care about is the "data" chunk. This is not necessarily the next chunk so we'll need to loop. */
+    for (;;)
+    {
+        drwav_chunk_header header;
+        drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
+        if (result != DRWAV_SUCCESS) {
+            if (!foundDataChunk) {
+                return DRWAV_FALSE;
+            } else {
+                break;  /* Probably at the end of the file. Get out of the loop. */
+            }
+        }
+
+        /* Tell the client about this chunk. */
+        if (!sequential && onChunk != NULL) {
+            drwav_uint64 callbackBytesRead = onChunk(pChunkUserData, pWav->onRead, pWav->onSeek, pWav->pUserData, &header, pWav->container, &fmt);
+
+            /*
+            dr_wav may need to read the contents of the chunk, so we now need to seek back to the position before
+            we called the callback.
+            */
+            if (callbackBytesRead > 0) {
+                if (!drwav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData)) {
+                    return DRWAV_FALSE;
+                }
+            }
+        }
+
+
+        if (!foundDataChunk) {
+            pWav->dataChunkDataPos = cursor;
+        }
+
+        chunkSize = header.sizeInBytes;
+        if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) {
+            if (drwav__fourcc_equal(header.id.fourcc, "data")) {
+                foundDataChunk = DRWAV_TRUE;
+                if (pWav->container != drwav_container_rf64) {  /* The data chunk size for RF64 will always be set to 0xFFFFFFFF here. It was set to its true value earlier. */
+                    dataChunkSize = chunkSize;
+                }
+            }
+        } else {
+            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_DATA)) {
+                foundDataChunk = DRWAV_TRUE;
+                dataChunkSize = chunkSize;
+            }
+        }
+
+        /*
+        If at this point we have found the data chunk and we're running in sequential mode, we need to break out of this loop.
The reason for + this is that we would otherwise require a backwards seek which sequential mode forbids. + */ + if (foundDataChunk && sequential) { + break; + } + + /* Optional. Get the total sample count from the FACT chunk. This is useful for compressed formats. */ + if (pWav->container == drwav_container_riff) { + if (drwav__fourcc_equal(header.id.fourcc, "fact")) { + drwav_uint32 sampleCount; + if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCount, 4, &cursor) != 4) { + return DRWAV_FALSE; + } + chunkSize -= 4; + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + + /* + The sample count in the "fact" chunk is either unreliable, or I'm not understanding it properly. For now I am only enabling this + for Microsoft ADPCM formats. + */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + sampleCountFromFactChunk = sampleCount; + } else { + sampleCountFromFactChunk = 0; + } + } + } else if (pWav->container == drwav_container_w64) { + if (drwav__guid_equal(header.id.guid, drwavGUID_W64_FACT)) { + if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) { + return DRWAV_FALSE; + } + chunkSize -= 8; + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + } + } else if (pWav->container == drwav_container_rf64) { + /* We retrieved the sample count from the ds64 chunk earlier so no need to do that here. */ + } + + /* "smpl" chunk. */ + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + if (drwav__fourcc_equal(header.id.fourcc, "smpl")) { + drwav_uint8 smplHeaderData[36]; /* 36 = size of the smpl header section, not including the loop data. */ + if (chunkSize >= sizeof(smplHeaderData)) { + drwav_uint64 bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplHeaderData, sizeof(smplHeaderData), &cursor); + chunkSize -= bytesJustRead; + + if (bytesJustRead == sizeof(smplHeaderData)) { + drwav_uint32 iLoop; + + pWav->smpl.manufacturer = drwav__bytes_to_u32(smplHeaderData+0); + pWav->smpl.product = drwav__bytes_to_u32(smplHeaderData+4); + pWav->smpl.samplePeriod = drwav__bytes_to_u32(smplHeaderData+8); + pWav->smpl.midiUnityNotes = drwav__bytes_to_u32(smplHeaderData+12); + pWav->smpl.midiPitchFraction = drwav__bytes_to_u32(smplHeaderData+16); + pWav->smpl.smpteFormat = drwav__bytes_to_u32(smplHeaderData+20); + pWav->smpl.smpteOffset = drwav__bytes_to_u32(smplHeaderData+24); + pWav->smpl.numSampleLoops = drwav__bytes_to_u32(smplHeaderData+28); + pWav->smpl.samplerData = drwav__bytes_to_u32(smplHeaderData+32); + + for (iLoop = 0; iLoop < pWav->smpl.numSampleLoops && iLoop < drwav_countof(pWav->smpl.loops); ++iLoop) { + drwav_uint8 smplLoopData[24]; /* 24 = size of a loop section in the smpl chunk. */ + bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplLoopData, sizeof(smplLoopData), &cursor); + chunkSize -= bytesJustRead; + + if (bytesJustRead == sizeof(smplLoopData)) { + pWav->smpl.loops[iLoop].cuePointId = drwav__bytes_to_u32(smplLoopData+0); + pWav->smpl.loops[iLoop].type = drwav__bytes_to_u32(smplLoopData+4); + pWav->smpl.loops[iLoop].start = drwav__bytes_to_u32(smplLoopData+8); + pWav->smpl.loops[iLoop].end = drwav__bytes_to_u32(smplLoopData+12); + pWav->smpl.loops[iLoop].fraction = drwav__bytes_to_u32(smplLoopData+16); + pWav->smpl.loops[iLoop].playCount = drwav__bytes_to_u32(smplLoopData+20); + } else { + break; /* Break from the smpl loop for loop. */ + } + } + } + } else { + /* Looks like invalid data. Ignore the chunk. 
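+                    (The chunk declares fewer than the 36 bytes the fixed smpl header needs, so none of its
+                    fields can be read safely.)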
*/
+                }
+            }
+        } else {
+            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_SMPL)) {
+                /*
+                This path will be hit when a W64 WAV file contains a smpl chunk. I don't have a sample file to test this path, so a contribution
+                is welcome to add support for this.
+                */
+            }
+        }
+
+        /* Make sure we seek past the padding. */
+        chunkSize += header.paddingSize;
+        if (!drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData)) {
+            break;
+        }
+        cursor += chunkSize;
+
+        if (!foundDataChunk) {
+            pWav->dataChunkDataPos = cursor;
+        }
+    }
+
+    /* If we haven't found a data chunk, return an error. */
+    if (!foundDataChunk) {
+        return DRWAV_FALSE;
+    }
+
+    /* We may have moved past the data chunk. If so we need to move back. If running in sequential mode we can assume we are already sitting on the data chunk. */
+    if (!sequential) {
+        if (!drwav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData)) {
+            return DRWAV_FALSE;
+        }
+        cursor = pWav->dataChunkDataPos;
+    }
+
+
+    /* At this point we should be sitting on the first byte of the raw audio data. */
+
+    pWav->fmt = fmt;
+    pWav->sampleRate = fmt.sampleRate;
+    pWav->channels = fmt.channels;
+    pWav->bitsPerSample = fmt.bitsPerSample;
+    pWav->bytesRemaining = dataChunkSize;
+    pWav->translatedFormatTag = translatedFormatTag;
+    pWav->dataChunkDataSize = dataChunkSize;
+
+    if (sampleCountFromFactChunk != 0) {
+        pWav->totalPCMFrameCount = sampleCountFromFactChunk;
+    } else {
+        pWav->totalPCMFrameCount = dataChunkSize / drwav_get_bytes_per_pcm_frame(pWav);
+
+        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+            drwav_uint64 totalBlockHeaderSizeInBytes;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+
+            /* Make sure any trailing partial block is accounted for. */
+            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
+                blockCount += 1;
+            }
+
+            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
+            totalBlockHeaderSizeInBytes = blockCount * (6*fmt.channels);
+            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
+        }
+        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+            drwav_uint64 totalBlockHeaderSizeInBytes;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+
+            /* Make sure any trailing partial block is accounted for. */
+            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
+                blockCount += 1;
+            }
+
+            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
+            totalBlockHeaderSizeInBytes = blockCount * (4*fmt.channels);
+            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
+
+            /* The header includes a decoded sample for each channel which acts as the initial predictor sample. */
+            pWav->totalPCMFrameCount += blockCount;
+        }
+    }
+
+    /* Some formats only support a certain number of channels. */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        if (pWav->channels > 2) {
+            return DRWAV_FALSE;
+        }
+    }
+
+#ifdef DR_WAV_LIBSNDFILE_COMPAT
+    /*
+    I use libsndfile as a benchmark for testing, however in the version I'm using (from the Windows installer on the libsndfile website),
+    it appears the total sample count libsndfile uses for MS-ADPCM is incorrect.
It would seem they are computing the total sample count + from the number of blocks, however this results in the inclusion of extra silent samples at the end of the last block. The correct + way to know the total sample count is to inspect the "fact" chunk, which should always be present for compressed formats, and should + always include the sample count. This little block of code below is only used to emulate the libsndfile logic so I can properly run my + correctness tests against libsndfile, and is disabled by default. + */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2)) / fmt.channels; /* x2 because two samples per byte. */ + } + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels)) / fmt.channels; + } +#endif + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit(pWav, onRead, onSeek, pReadSeekUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init__internal(pWav, onChunk, pChunkUserData, flags); +} + + +static drwav_uint32 drwav__riff_chunk_size_riff(drwav_uint64 dataChunkSize) +{ + drwav_uint64 chunkSize = 4 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 24 = "fmt " chunk. */ + if (chunkSize > 0xFFFFFFFFUL) { + chunkSize = 0xFFFFFFFFUL; + } + + return (drwav_uint32)chunkSize; /* Safe cast due to the clamp above. */ +} + +static drwav_uint32 drwav__data_chunk_size_riff(drwav_uint64 dataChunkSize) +{ + if (dataChunkSize <= 0xFFFFFFFFUL) { + return (drwav_uint32)dataChunkSize; + } else { + return 0xFFFFFFFFUL; + } +} + +static drwav_uint64 drwav__riff_chunk_size_w64(drwav_uint64 dataChunkSize) +{ + drwav_uint64 dataSubchunkPaddingSize = drwav__chunk_padding_size_w64(dataChunkSize); + + return 80 + 24 + dataChunkSize + dataSubchunkPaddingSize; /* +24 because W64 includes the size of the GUID and size fields. */ +} + +static drwav_uint64 drwav__data_chunk_size_w64(drwav_uint64 dataChunkSize) +{ + return 24 + dataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ +} + +static drwav_uint64 drwav__riff_chunk_size_rf64(drwav_uint64 dataChunkSize) +{ + drwav_uint64 chunkSize = 4 + 36 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 36 = "ds64" chunk. 24 = "fmt " chunk. */ + if (chunkSize > 0xFFFFFFFFUL) { + chunkSize = 0xFFFFFFFFUL; + } + + return chunkSize; +} + +static drwav_uint64 drwav__data_chunk_size_rf64(drwav_uint64 dataChunkSize) +{ + return dataChunkSize; +} + + +static size_t drwav__write(drwav* pWav, const void* pData, size_t dataSize) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + /* Generic write. Assumes no byte reordering required. 
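+    Multi-byte fields are written with the drwav__write_u16ne_to_le(), drwav__write_u32ne_to_le() and
+    drwav__write_u64ne_to_le() helpers below, which byte swap first on big-endian hosts.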
*/ + return pWav->onWrite(pWav->pUserData, pData, dataSize); +} + +static size_t drwav__write_u16ne_to_le(drwav* pWav, drwav_uint16 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap16(value); + } + + return drwav__write(pWav, &value, 2); +} + +static size_t drwav__write_u32ne_to_le(drwav* pWav, drwav_uint32 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap32(value); + } + + return drwav__write(pWav, &value, 4); +} + +static size_t drwav__write_u64ne_to_le(drwav* pWav, drwav_uint64 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap64(value); + } + + return drwav__write(pWav, &value, 8); +} + + +static drwav_bool32 drwav_preinit_write(drwav* pWav, const drwav_data_format* pFormat, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pWav == NULL || onWrite == NULL) { + return DRWAV_FALSE; + } + + if (!isSequential && onSeek == NULL) { + return DRWAV_FALSE; /* <-- onSeek is required when in non-sequential mode. */ + } + + /* Not currently supporting compressed formats. Will need to add support for the "fact" chunk before we enable this. */ + if (pFormat->format == DR_WAVE_FORMAT_EXTENSIBLE) { + return DRWAV_FALSE; + } + if (pFormat->format == DR_WAVE_FORMAT_ADPCM || pFormat->format == DR_WAVE_FORMAT_DVI_ADPCM) { + return DRWAV_FALSE; + } + + DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav)); + pWav->onWrite = onWrite; + pWav->onSeek = onSeek; + pWav->pUserData = pUserData; + pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks); + + if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) { + return DRWAV_FALSE; /* Invalid allocation callbacks. */ + } + + pWav->fmt.formatTag = (drwav_uint16)pFormat->format; + pWav->fmt.channels = (drwav_uint16)pFormat->channels; + pWav->fmt.sampleRate = pFormat->sampleRate; + pWav->fmt.avgBytesPerSec = (drwav_uint32)((pFormat->bitsPerSample * pFormat->sampleRate * pFormat->channels) / 8); + pWav->fmt.blockAlign = (drwav_uint16)((pFormat->channels * pFormat->bitsPerSample) / 8); + pWav->fmt.bitsPerSample = (drwav_uint16)pFormat->bitsPerSample; + pWav->fmt.extendedSize = 0; + pWav->isSequentialWrite = isSequential; + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount) +{ + /* The function assumes drwav_preinit_write() was called beforehand. */ + + size_t runningPos = 0; + drwav_uint64 initialDataChunkSize = 0; + drwav_uint64 chunkSizeFMT; + + /* + The initial values for the "RIFF" and "data" chunks depends on whether or not we are initializing in sequential mode or not. In + sequential mode we set this to its final values straight away since they can be calculated from the total sample count. In non- + sequential mode we initialize it all to zero and fill it out in drwav_uninit() using a backwards seek. + */ + if (pWav->isSequentialWrite) { + initialDataChunkSize = (totalSampleCount * pWav->fmt.bitsPerSample) / 8; + + /* + The RIFF container has a limit on the number of samples. drwav is not allowing this. 
There's no practical limits for Wave64 + so for the sake of simplicity I'm not doing any validation for that. + */ + if (pFormat->container == drwav_container_riff) { + if (initialDataChunkSize > (0xFFFFFFFFUL - 36)) { + return DRWAV_FALSE; /* Not enough room to store every sample. */ + } + } + } + + pWav->dataChunkDataSizeTargetWrite = initialDataChunkSize; + + + /* "RIFF" chunk. */ + if (pFormat->container == drwav_container_riff) { + drwav_uint32 chunkSizeRIFF = 28 + (drwav_uint32)initialDataChunkSize; /* +28 = "WAVE" + [sizeof "fmt " chunk] */ + runningPos += drwav__write(pWav, "RIFF", 4); + runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeRIFF); + runningPos += drwav__write(pWav, "WAVE", 4); + } else if (pFormat->container == drwav_container_w64) { + drwav_uint64 chunkSizeRIFF = 80 + 24 + initialDataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ + runningPos += drwav__write(pWav, drwavGUID_W64_RIFF, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeRIFF); + runningPos += drwav__write(pWav, drwavGUID_W64_WAVE, 16); + } else if (pFormat->container == drwav_container_rf64) { + runningPos += drwav__write(pWav, "RF64", 4); + runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF); /* Always 0xFFFFFFFF for RF64. Set to a proper value in the "ds64" chunk. */ + runningPos += drwav__write(pWav, "WAVE", 4); + } + + + /* "ds64" chunk (RF64 only). */ + if (pFormat->container == drwav_container_rf64) { + drwav_uint32 initialds64ChunkSize = 28; /* 28 = [Size of RIFF (8 bytes)] + [Size of DATA (8 bytes)] + [Sample Count (8 bytes)] + [Table Length (4 bytes)]. Table length always set to 0. */ + drwav_uint64 initialRiffChunkSize = 8 + initialds64ChunkSize + initialDataChunkSize; /* +8 for the ds64 header. */ + + runningPos += drwav__write(pWav, "ds64", 4); + runningPos += drwav__write_u32ne_to_le(pWav, initialds64ChunkSize); /* Size of ds64. */ + runningPos += drwav__write_u64ne_to_le(pWav, initialRiffChunkSize); /* Size of RIFF. Set to true value at the end. */ + runningPos += drwav__write_u64ne_to_le(pWav, initialDataChunkSize); /* Size of DATA. Set to true value at the end. */ + runningPos += drwav__write_u64ne_to_le(pWav, totalSampleCount); /* Sample count. */ + runningPos += drwav__write_u32ne_to_le(pWav, 0); /* Table length. Always set to zero in our case since we're not doing any other chunks than "DATA". */ + } + + + /* "fmt " chunk. */ + if (pFormat->container == drwav_container_riff || pFormat->container == drwav_container_rf64) { + chunkSizeFMT = 16; + runningPos += drwav__write(pWav, "fmt ", 4); + runningPos += drwav__write_u32ne_to_le(pWav, (drwav_uint32)chunkSizeFMT); + } else if (pFormat->container == drwav_container_w64) { + chunkSizeFMT = 40; + runningPos += drwav__write(pWav, drwavGUID_W64_FMT, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeFMT); + } + + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.formatTag); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.channels); + runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.sampleRate); + runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.avgBytesPerSec); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.blockAlign); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.bitsPerSample); + + pWav->dataChunkDataPos = runningPos; + + /* "data" chunk. 
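+    (RIFF stores a 32-bit size, W64 stores a GUID plus a 64-bit size that counts its own 24 byte header, and
+    RF64 stores a 0xFFFFFFFF placeholder that defers to the ds64 chunk written earlier.)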
*/ + if (pFormat->container == drwav_container_riff) { + drwav_uint32 chunkSizeDATA = (drwav_uint32)initialDataChunkSize; + runningPos += drwav__write(pWav, "data", 4); + runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeDATA); + } else if (pFormat->container == drwav_container_w64) { + drwav_uint64 chunkSizeDATA = 24 + initialDataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ + runningPos += drwav__write(pWav, drwavGUID_W64_DATA, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeDATA); + } else if (pFormat->container == drwav_container_rf64) { + runningPos += drwav__write(pWav, "data", 4); + runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF); /* Always set to 0xFFFFFFFF for RF64. The true size of the data chunk is specified in the ds64 chunk. */ + } + + /* + The runningPos variable is incremented in the section above but is left unused which is causing some static analysis tools to detect it + as a dead store. I'm leaving this as-is for safety just in case I want to expand this function later to include other tags and want to + keep track of the running position for whatever reason. The line below should silence the static analysis tools. + */ + (void)runningPos; + + /* Set some properties for the client's convenience. */ + pWav->container = pFormat->container; + pWav->channels = (drwav_uint16)pFormat->channels; + pWav->sampleRate = pFormat->sampleRate; + pWav->bitsPerSample = (drwav_uint16)pFormat->bitsPerSample; + pWav->translatedFormatTag = (drwav_uint16)pFormat->format; + + return DRWAV_TRUE; +} + + +DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit_write(pWav, pFormat, DRWAV_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init_write__internal(pWav, pFormat, 0); /* DRWAV_FALSE = Not Sequential */ +} + +DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit_write(pWav, pFormat, DRWAV_TRUE, onWrite, NULL, pUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init_write__internal(pWav, pFormat, totalSampleCount); /* DRWAV_TRUE = Sequential */ +} + +DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_write_sequential(pWav, pFormat, totalPCMFrameCount*pFormat->channels, onWrite, pUserData, pAllocationCallbacks); +} + +DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount) +{ + /* Casting totalSampleCount to drwav_int64 for VC6 compatibility. No issues in practice because nobody is going to exhaust the whole 63 bits. 
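+    As a rough worked example, plugging totalSampleCount = 44100, channels = 2 and bitsPerSample = 16 into the
+    expression below yields 44100 * 2 * 16 / 8.0 = 176400 bytes for the data payload.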
*/ + drwav_uint64 targetDataSizeBytes = (drwav_uint64)((drwav_int64)totalSampleCount * pFormat->channels * pFormat->bitsPerSample/8.0); + drwav_uint64 riffChunkSizeBytes; + drwav_uint64 fileSizeBytes = 0; + + if (pFormat->container == drwav_container_riff) { + riffChunkSizeBytes = drwav__riff_chunk_size_riff(targetDataSizeBytes); + fileSizeBytes = (8 + riffChunkSizeBytes); /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */ + } else if (pFormat->container == drwav_container_w64) { + riffChunkSizeBytes = drwav__riff_chunk_size_w64(targetDataSizeBytes); + fileSizeBytes = riffChunkSizeBytes; + } else if (pFormat->container == drwav_container_rf64) { + riffChunkSizeBytes = drwav__riff_chunk_size_rf64(targetDataSizeBytes); + fileSizeBytes = (8 + riffChunkSizeBytes); /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */ + } + + return fileSizeBytes; +} + + +#ifndef DR_WAV_NO_STDIO + +/* drwav_result_from_errno() is only used for fopen() and wfopen() so putting it inside DR_WAV_NO_STDIO for now. If something else needs this later we can move it out. */ +#include +static drwav_result drwav_result_from_errno(int e) +{ + switch (e) + { + case 0: return DRWAV_SUCCESS; + #ifdef EPERM + case EPERM: return DRWAV_INVALID_OPERATION; + #endif + #ifdef ENOENT + case ENOENT: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef ESRCH + case ESRCH: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef EINTR + case EINTR: return DRWAV_INTERRUPT; + #endif + #ifdef EIO + case EIO: return DRWAV_IO_ERROR; + #endif + #ifdef ENXIO + case ENXIO: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef E2BIG + case E2BIG: return DRWAV_INVALID_ARGS; + #endif + #ifdef ENOEXEC + case ENOEXEC: return DRWAV_INVALID_FILE; + #endif + #ifdef EBADF + case EBADF: return DRWAV_INVALID_FILE; + #endif + #ifdef ECHILD + case ECHILD: return DRWAV_ERROR; + #endif + #ifdef EAGAIN + case EAGAIN: return DRWAV_UNAVAILABLE; + #endif + #ifdef ENOMEM + case ENOMEM: return DRWAV_OUT_OF_MEMORY; + #endif + #ifdef EACCES + case EACCES: return DRWAV_ACCESS_DENIED; + #endif + #ifdef EFAULT + case EFAULT: return DRWAV_BAD_ADDRESS; + #endif + #ifdef ENOTBLK + case ENOTBLK: return DRWAV_ERROR; + #endif + #ifdef EBUSY + case EBUSY: return DRWAV_BUSY; + #endif + #ifdef EEXIST + case EEXIST: return DRWAV_ALREADY_EXISTS; + #endif + #ifdef EXDEV + case EXDEV: return DRWAV_ERROR; + #endif + #ifdef ENODEV + case ENODEV: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef ENOTDIR + case ENOTDIR: return DRWAV_NOT_DIRECTORY; + #endif + #ifdef EISDIR + case EISDIR: return DRWAV_IS_DIRECTORY; + #endif + #ifdef EINVAL + case EINVAL: return DRWAV_INVALID_ARGS; + #endif + #ifdef ENFILE + case ENFILE: return DRWAV_TOO_MANY_OPEN_FILES; + #endif + #ifdef EMFILE + case EMFILE: return DRWAV_TOO_MANY_OPEN_FILES; + #endif + #ifdef ENOTTY + case ENOTTY: return DRWAV_INVALID_OPERATION; + #endif + #ifdef ETXTBSY + case ETXTBSY: return DRWAV_BUSY; + #endif + #ifdef EFBIG + case EFBIG: return DRWAV_TOO_BIG; + #endif + #ifdef ENOSPC + case ENOSPC: return DRWAV_NO_SPACE; + #endif + #ifdef ESPIPE + case ESPIPE: return DRWAV_BAD_SEEK; + #endif + #ifdef EROFS + case EROFS: return DRWAV_ACCESS_DENIED; + #endif + #ifdef EMLINK + case EMLINK: return DRWAV_TOO_MANY_LINKS; + #endif + #ifdef EPIPE + case EPIPE: return DRWAV_BAD_PIPE; + #endif + #ifdef EDOM + case EDOM: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef ERANGE + case ERANGE: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef EDEADLK + case EDEADLK: return DRWAV_DEADLOCK; + #endif + #ifdef ENAMETOOLONG 
+ case ENAMETOOLONG: return DRWAV_PATH_TOO_LONG; + #endif + #ifdef ENOLCK + case ENOLCK: return DRWAV_ERROR; + #endif + #ifdef ENOSYS + case ENOSYS: return DRWAV_NOT_IMPLEMENTED; + #endif + #ifdef ENOTEMPTY + case ENOTEMPTY: return DRWAV_DIRECTORY_NOT_EMPTY; + #endif + #ifdef ELOOP + case ELOOP: return DRWAV_TOO_MANY_LINKS; + #endif + #ifdef ENOMSG + case ENOMSG: return DRWAV_NO_MESSAGE; + #endif + #ifdef EIDRM + case EIDRM: return DRWAV_ERROR; + #endif + #ifdef ECHRNG + case ECHRNG: return DRWAV_ERROR; + #endif + #ifdef EL2NSYNC + case EL2NSYNC: return DRWAV_ERROR; + #endif + #ifdef EL3HLT + case EL3HLT: return DRWAV_ERROR; + #endif + #ifdef EL3RST + case EL3RST: return DRWAV_ERROR; + #endif + #ifdef ELNRNG + case ELNRNG: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef EUNATCH + case EUNATCH: return DRWAV_ERROR; + #endif + #ifdef ENOCSI + case ENOCSI: return DRWAV_ERROR; + #endif + #ifdef EL2HLT + case EL2HLT: return DRWAV_ERROR; + #endif + #ifdef EBADE + case EBADE: return DRWAV_ERROR; + #endif + #ifdef EBADR + case EBADR: return DRWAV_ERROR; + #endif + #ifdef EXFULL + case EXFULL: return DRWAV_ERROR; + #endif + #ifdef ENOANO + case ENOANO: return DRWAV_ERROR; + #endif + #ifdef EBADRQC + case EBADRQC: return DRWAV_ERROR; + #endif + #ifdef EBADSLT + case EBADSLT: return DRWAV_ERROR; + #endif + #ifdef EBFONT + case EBFONT: return DRWAV_INVALID_FILE; + #endif + #ifdef ENOSTR + case ENOSTR: return DRWAV_ERROR; + #endif + #ifdef ENODATA + case ENODATA: return DRWAV_NO_DATA_AVAILABLE; + #endif + #ifdef ETIME + case ETIME: return DRWAV_TIMEOUT; + #endif + #ifdef ENOSR + case ENOSR: return DRWAV_NO_DATA_AVAILABLE; + #endif + #ifdef ENONET + case ENONET: return DRWAV_NO_NETWORK; + #endif + #ifdef ENOPKG + case ENOPKG: return DRWAV_ERROR; + #endif + #ifdef EREMOTE + case EREMOTE: return DRWAV_ERROR; + #endif + #ifdef ENOLINK + case ENOLINK: return DRWAV_ERROR; + #endif + #ifdef EADV + case EADV: return DRWAV_ERROR; + #endif + #ifdef ESRMNT + case ESRMNT: return DRWAV_ERROR; + #endif + #ifdef ECOMM + case ECOMM: return DRWAV_ERROR; + #endif + #ifdef EPROTO + case EPROTO: return DRWAV_ERROR; + #endif + #ifdef EMULTIHOP + case EMULTIHOP: return DRWAV_ERROR; + #endif + #ifdef EDOTDOT + case EDOTDOT: return DRWAV_ERROR; + #endif + #ifdef EBADMSG + case EBADMSG: return DRWAV_BAD_MESSAGE; + #endif + #ifdef EOVERFLOW + case EOVERFLOW: return DRWAV_TOO_BIG; + #endif + #ifdef ENOTUNIQ + case ENOTUNIQ: return DRWAV_NOT_UNIQUE; + #endif + #ifdef EBADFD + case EBADFD: return DRWAV_ERROR; + #endif + #ifdef EREMCHG + case EREMCHG: return DRWAV_ERROR; + #endif + #ifdef ELIBACC + case ELIBACC: return DRWAV_ACCESS_DENIED; + #endif + #ifdef ELIBBAD + case ELIBBAD: return DRWAV_INVALID_FILE; + #endif + #ifdef ELIBSCN + case ELIBSCN: return DRWAV_INVALID_FILE; + #endif + #ifdef ELIBMAX + case ELIBMAX: return DRWAV_ERROR; + #endif + #ifdef ELIBEXEC + case ELIBEXEC: return DRWAV_ERROR; + #endif + #ifdef EILSEQ + case EILSEQ: return DRWAV_INVALID_DATA; + #endif + #ifdef ERESTART + case ERESTART: return DRWAV_ERROR; + #endif + #ifdef ESTRPIPE + case ESTRPIPE: return DRWAV_ERROR; + #endif + #ifdef EUSERS + case EUSERS: return DRWAV_ERROR; + #endif + #ifdef ENOTSOCK + case ENOTSOCK: return DRWAV_NOT_SOCKET; + #endif + #ifdef EDESTADDRREQ + case EDESTADDRREQ: return DRWAV_NO_ADDRESS; + #endif + #ifdef EMSGSIZE + case EMSGSIZE: return DRWAV_TOO_BIG; + #endif + #ifdef EPROTOTYPE + case EPROTOTYPE: return DRWAV_BAD_PROTOCOL; + #endif + #ifdef ENOPROTOOPT + case ENOPROTOOPT: return DRWAV_PROTOCOL_UNAVAILABLE; + #endif + #ifdef 
EPROTONOSUPPORT + case EPROTONOSUPPORT: return DRWAV_PROTOCOL_NOT_SUPPORTED; + #endif + #ifdef ESOCKTNOSUPPORT + case ESOCKTNOSUPPORT: return DRWAV_SOCKET_NOT_SUPPORTED; + #endif + #ifdef EOPNOTSUPP + case EOPNOTSUPP: return DRWAV_INVALID_OPERATION; + #endif + #ifdef EPFNOSUPPORT + case EPFNOSUPPORT: return DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED; + #endif + #ifdef EAFNOSUPPORT + case EAFNOSUPPORT: return DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED; + #endif + #ifdef EADDRINUSE + case EADDRINUSE: return DRWAV_ALREADY_IN_USE; + #endif + #ifdef EADDRNOTAVAIL + case EADDRNOTAVAIL: return DRWAV_ERROR; + #endif + #ifdef ENETDOWN + case ENETDOWN: return DRWAV_NO_NETWORK; + #endif + #ifdef ENETUNREACH + case ENETUNREACH: return DRWAV_NO_NETWORK; + #endif + #ifdef ENETRESET + case ENETRESET: return DRWAV_NO_NETWORK; + #endif + #ifdef ECONNABORTED + case ECONNABORTED: return DRWAV_NO_NETWORK; + #endif + #ifdef ECONNRESET + case ECONNRESET: return DRWAV_CONNECTION_RESET; + #endif + #ifdef ENOBUFS + case ENOBUFS: return DRWAV_NO_SPACE; + #endif + #ifdef EISCONN + case EISCONN: return DRWAV_ALREADY_CONNECTED; + #endif + #ifdef ENOTCONN + case ENOTCONN: return DRWAV_NOT_CONNECTED; + #endif + #ifdef ESHUTDOWN + case ESHUTDOWN: return DRWAV_ERROR; + #endif + #ifdef ETOOMANYREFS + case ETOOMANYREFS: return DRWAV_ERROR; + #endif + #ifdef ETIMEDOUT + case ETIMEDOUT: return DRWAV_TIMEOUT; + #endif + #ifdef ECONNREFUSED + case ECONNREFUSED: return DRWAV_CONNECTION_REFUSED; + #endif + #ifdef EHOSTDOWN + case EHOSTDOWN: return DRWAV_NO_HOST; + #endif + #ifdef EHOSTUNREACH + case EHOSTUNREACH: return DRWAV_NO_HOST; + #endif + #ifdef EALREADY + case EALREADY: return DRWAV_IN_PROGRESS; + #endif + #ifdef EINPROGRESS + case EINPROGRESS: return DRWAV_IN_PROGRESS; + #endif + #ifdef ESTALE + case ESTALE: return DRWAV_INVALID_FILE; + #endif + #ifdef EUCLEAN + case EUCLEAN: return DRWAV_ERROR; + #endif + #ifdef ENOTNAM + case ENOTNAM: return DRWAV_ERROR; + #endif + #ifdef ENAVAIL + case ENAVAIL: return DRWAV_ERROR; + #endif + #ifdef EISNAM + case EISNAM: return DRWAV_ERROR; + #endif + #ifdef EREMOTEIO + case EREMOTEIO: return DRWAV_IO_ERROR; + #endif + #ifdef EDQUOT + case EDQUOT: return DRWAV_NO_SPACE; + #endif + #ifdef ENOMEDIUM + case ENOMEDIUM: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef EMEDIUMTYPE + case EMEDIUMTYPE: return DRWAV_ERROR; + #endif + #ifdef ECANCELED + case ECANCELED: return DRWAV_CANCELLED; + #endif + #ifdef ENOKEY + case ENOKEY: return DRWAV_ERROR; + #endif + #ifdef EKEYEXPIRED + case EKEYEXPIRED: return DRWAV_ERROR; + #endif + #ifdef EKEYREVOKED + case EKEYREVOKED: return DRWAV_ERROR; + #endif + #ifdef EKEYREJECTED + case EKEYREJECTED: return DRWAV_ERROR; + #endif + #ifdef EOWNERDEAD + case EOWNERDEAD: return DRWAV_ERROR; + #endif + #ifdef ENOTRECOVERABLE + case ENOTRECOVERABLE: return DRWAV_ERROR; + #endif + #ifdef ERFKILL + case ERFKILL: return DRWAV_ERROR; + #endif + #ifdef EHWPOISON + case EHWPOISON: return DRWAV_ERROR; + #endif + default: return DRWAV_ERROR; + } +} + +static drwav_result drwav_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode) +{ +#if _MSC_VER && _MSC_VER >= 1400 + errno_t err; +#endif + + if (ppFile != NULL) { + *ppFile = NULL; /* Safety. 
*/ + } + + if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) { + return DRWAV_INVALID_ARGS; + } + +#if _MSC_VER && _MSC_VER >= 1400 + err = fopen_s(ppFile, pFilePath, pOpenMode); + if (err != 0) { + return drwav_result_from_errno(err); + } +#else +#if defined(_WIN32) || defined(__APPLE__) + *ppFile = fopen(pFilePath, pOpenMode); +#else + #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE) + *ppFile = fopen64(pFilePath, pOpenMode); + #else + *ppFile = fopen(pFilePath, pOpenMode); + #endif +#endif + if (*ppFile == NULL) { + drwav_result result = drwav_result_from_errno(errno); + if (result == DRWAV_SUCCESS) { + result = DRWAV_ERROR; /* Just a safety check to make sure we never ever return success when pFile == NULL. */ + } + + return result; + } +#endif + + return DRWAV_SUCCESS; +} + +/* +_wfopen() isn't always available in all compilation environments. + + * Windows only. + * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back). + * MinGW-64 (both 32- and 64-bit) seems to support it. + * MinGW wraps it in !defined(__STRICT_ANSI__). + * OpenWatcom wraps it in !defined(_NO_EXT_KEYS). + +This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs() +fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support. +*/ +#if defined(_WIN32) + #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS)) + #define DRWAV_HAS_WFOPEN + #endif +#endif + +static drwav_result drwav_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (ppFile != NULL) { + *ppFile = NULL; /* Safety. */ + } + + if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) { + return DRWAV_INVALID_ARGS; + } + +#if defined(DRWAV_HAS_WFOPEN) + { + /* Use _wfopen() on Windows. */ + #if defined(_MSC_VER) && _MSC_VER >= 1400 + errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode); + if (err != 0) { + return drwav_result_from_errno(err); + } + #else + *ppFile = _wfopen(pFilePath, pOpenMode); + if (*ppFile == NULL) { + return drwav_result_from_errno(errno); + } + #endif + (void)pAllocationCallbacks; + } +#else + /* + Use fopen() on anything other than Windows. Requires a conversion. This is annoying because fopen() is locale specific. The only real way I can + think of to do this is with wcsrtombs(). Note that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for + maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler error I'll look into improving compatibility. + */ + { + mbstate_t mbs; + size_t lenMB; + const wchar_t* pFilePathTemp = pFilePath; + char* pFilePathMB = NULL; + char pOpenModeMB[32] = {0}; + + /* Get the length first. */ + DRWAV_ZERO_OBJECT(&mbs); + lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs); + if (lenMB == (size_t)-1) { + return drwav_result_from_errno(errno); + } + + pFilePathMB = (char*)drwav__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks); + if (pFilePathMB == NULL) { + return DRWAV_OUT_OF_MEMORY; + } + + pFilePathTemp = pFilePath; + DRWAV_ZERO_OBJECT(&mbs); + wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs); + + /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. 
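+    For example, L"wb" becomes "wb", each wchar_t simply being truncated to char in the loop below.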
*/ + { + size_t i = 0; + for (;;) { + if (pOpenMode[i] == 0) { + pOpenModeMB[i] = '\0'; + break; + } + + pOpenModeMB[i] = (char)pOpenMode[i]; + i += 1; + } + } + + *ppFile = fopen(pFilePathMB, pOpenModeMB); + + drwav__free_from_callbacks(pFilePathMB, pAllocationCallbacks); + } + + if (*ppFile == NULL) { + return DRWAV_ERROR; + } +#endif + + return DRWAV_SUCCESS; +} + + +static size_t drwav__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead) +{ + return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData); +} + +static size_t drwav__on_write_stdio(void* pUserData, const void* pData, size_t bytesToWrite) +{ + return fwrite(pData, 1, bytesToWrite, (FILE*)pUserData); +} + +static drwav_bool32 drwav__on_seek_stdio(void* pUserData, int offset, drwav_seek_origin origin) +{ + return fseek((FILE*)pUserData, offset, (origin == drwav_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0; +} + +DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_ex(pWav, filename, NULL, NULL, 0, pAllocationCallbacks); +} + + +static drwav_bool32 drwav_init_file__internal_FILE(drwav* pWav, FILE* pFile, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav_bool32 result; + + result = drwav_preinit(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + result = drwav_init__internal(pWav, onChunk, pChunkUserData, flags); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_fopen(&pFile, filename, "rb") != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_ex_w(pWav, filename, NULL, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_wfopen(&pFile, filename, L"rb", pAllocationCallbacks) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. 
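+       drwav_init_file__internal_FILE() closes the handle on every failure path, and a
+       later drwav_uninit() closes it on success, so the caller must never fclose() it
+       directly.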
*/ + return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks); +} + + +static drwav_bool32 drwav_init_file_write__internal_FILE(drwav* pWav, FILE* pFile, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav_bool32 result; + + result = drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + result = drwav_init_write__internal(pWav, pFormat, totalSampleCount); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init_file_write__internal(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_fopen(&pFile, filename, "wb") != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks); +} + +static drwav_bool32 drwav_init_file_write_w__internal(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_wfopen(&pFile, filename, L"wb", pAllocationCallbacks) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_file_write_sequential(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write_w__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 
drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_file_write_sequential_w(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} +#endif /* DR_WAV_NO_STDIO */ + + +static size_t drwav__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead) +{ + drwav* pWav = (drwav*)pUserData; + size_t bytesRemaining; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->memoryStream.dataSize >= pWav->memoryStream.currentReadPos); + + bytesRemaining = pWav->memoryStream.dataSize - pWav->memoryStream.currentReadPos; + if (bytesToRead > bytesRemaining) { + bytesToRead = bytesRemaining; + } + + if (bytesToRead > 0) { + DRWAV_COPY_MEMORY(pBufferOut, pWav->memoryStream.data + pWav->memoryStream.currentReadPos, bytesToRead); + pWav->memoryStream.currentReadPos += bytesToRead; + } + + return bytesToRead; +} + +static drwav_bool32 drwav__on_seek_memory(void* pUserData, int offset, drwav_seek_origin origin) +{ + drwav* pWav = (drwav*)pUserData; + DRWAV_ASSERT(pWav != NULL); + + if (origin == drwav_seek_origin_current) { + if (offset > 0) { + if (pWav->memoryStream.currentReadPos + offset > pWav->memoryStream.dataSize) { + return DRWAV_FALSE; /* Trying to seek too far forward. */ + } + } else { + if (pWav->memoryStream.currentReadPos < (size_t)-offset) { + return DRWAV_FALSE; /* Trying to seek too far backwards. */ + } + } + + /* This will never underflow thanks to the clamps above. */ + pWav->memoryStream.currentReadPos += offset; + } else { + if ((drwav_uint32)offset <= pWav->memoryStream.dataSize) { + pWav->memoryStream.currentReadPos = offset; + } else { + return DRWAV_FALSE; /* Trying to seek too far forward. */ + } + } + + return DRWAV_TRUE; +} + +static size_t drwav__on_write_memory(void* pUserData, const void* pDataIn, size_t bytesToWrite) +{ + drwav* pWav = (drwav*)pUserData; + size_t bytesRemaining; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->memoryStreamWrite.dataCapacity >= pWav->memoryStreamWrite.currentWritePos); + + bytesRemaining = pWav->memoryStreamWrite.dataCapacity - pWav->memoryStreamWrite.currentWritePos; + if (bytesRemaining < bytesToWrite) { + /* Need to reallocate. */ + void* pNewData; + size_t newDataCapacity = (pWav->memoryStreamWrite.dataCapacity == 0) ? 256 : pWav->memoryStreamWrite.dataCapacity * 2; + + /* If doubling wasn't enough, just make it the minimum required size to write the data. 
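+            A worked example with hypothetical numbers: capacity 256, write position 200,
+            incoming write of 500 bytes. Doubling yields 512, but 512 - 200 = 312 < 500,
+            so the capacity is bumped to exactly 200 + 500 = 700 instead.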
*/ + if ((newDataCapacity - pWav->memoryStreamWrite.currentWritePos) < bytesToWrite) { + newDataCapacity = pWav->memoryStreamWrite.currentWritePos + bytesToWrite; + } + + pNewData = drwav__realloc_from_callbacks(*pWav->memoryStreamWrite.ppData, newDataCapacity, pWav->memoryStreamWrite.dataCapacity, &pWav->allocationCallbacks); + if (pNewData == NULL) { + return 0; + } + + *pWav->memoryStreamWrite.ppData = pNewData; + pWav->memoryStreamWrite.dataCapacity = newDataCapacity; + } + + DRWAV_COPY_MEMORY(((drwav_uint8*)(*pWav->memoryStreamWrite.ppData)) + pWav->memoryStreamWrite.currentWritePos, pDataIn, bytesToWrite); + + pWav->memoryStreamWrite.currentWritePos += bytesToWrite; + if (pWav->memoryStreamWrite.dataSize < pWav->memoryStreamWrite.currentWritePos) { + pWav->memoryStreamWrite.dataSize = pWav->memoryStreamWrite.currentWritePos; + } + + *pWav->memoryStreamWrite.pDataSize = pWav->memoryStreamWrite.dataSize; + + return bytesToWrite; +} + +static drwav_bool32 drwav__on_seek_memory_write(void* pUserData, int offset, drwav_seek_origin origin) +{ + drwav* pWav = (drwav*)pUserData; + DRWAV_ASSERT(pWav != NULL); + + if (origin == drwav_seek_origin_current) { + if (offset > 0) { + if (pWav->memoryStreamWrite.currentWritePos + offset > pWav->memoryStreamWrite.dataSize) { + offset = (int)(pWav->memoryStreamWrite.dataSize - pWav->memoryStreamWrite.currentWritePos); /* Trying to seek too far forward. */ + } + } else { + if (pWav->memoryStreamWrite.currentWritePos < (size_t)-offset) { + offset = -(int)pWav->memoryStreamWrite.currentWritePos; /* Trying to seek too far backwards. */ + } + } + + /* This will never underflow thanks to the clamps above. */ + pWav->memoryStreamWrite.currentWritePos += offset; + } else { + if ((drwav_uint32)offset <= pWav->memoryStreamWrite.dataSize) { + pWav->memoryStreamWrite.currentWritePos = offset; + } else { + pWav->memoryStreamWrite.currentWritePos = pWav->memoryStreamWrite.dataSize; /* Trying to seek too far forward. */ + } + } + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_ex(pWav, data, dataSize, NULL, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (data == NULL || dataSize == 0) { + return DRWAV_FALSE; + } + + if (!drwav_preinit(pWav, drwav__on_read_memory, drwav__on_seek_memory, pWav, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + pWav->memoryStream.data = (const drwav_uint8*)data; + pWav->memoryStream.dataSize = dataSize; + pWav->memoryStream.currentReadPos = 0; + + return drwav_init__internal(pWav, onChunk, pChunkUserData, flags); +} + + +static drwav_bool32 drwav_init_memory_write__internal(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (ppData == NULL || pDataSize == NULL) { + return DRWAV_FALSE; + } + + *ppData = NULL; /* Important because we're using realloc()! 
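+       drwav__realloc_from_callbacks() follows realloc() semantics, so the first grow
+       inside drwav__on_write_memory() must see a NULL pointer (realloc(NULL, n) behaves
+       like malloc(n)); an uninitialized value here would be handed straight to the
+       allocator.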
*/ + *pDataSize = 0; + + if (!drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_memory, drwav__on_seek_memory_write, pWav, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + pWav->memoryStreamWrite.ppData = ppData; + pWav->memoryStreamWrite.pDataSize = pDataSize; + pWav->memoryStreamWrite.dataSize = 0; + pWav->memoryStreamWrite.dataCapacity = 0; + pWav->memoryStreamWrite.currentWritePos = 0; + + return drwav_init_write__internal(pWav, pFormat, totalSampleCount); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_memory_write_sequential(pWav, ppData, pDataSize, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} + + + +DRWAV_API drwav_result drwav_uninit(drwav* pWav) +{ + drwav_result result = DRWAV_SUCCESS; + + if (pWav == NULL) { + return DRWAV_INVALID_ARGS; + } + + /* + If the drwav object was opened in write mode we'll need to finalize a few things: + - Make sure the "data" chunk is aligned to 16-bits for RIFF containers, or 64 bits for W64 containers. + - Set the size of the "data" chunk. + */ + if (pWav->onWrite != NULL) { + drwav_uint32 paddingSize = 0; + + /* Padding. Do not adjust pWav->dataChunkDataSize - this should not include the padding. */ + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + paddingSize = drwav__chunk_padding_size_riff(pWav->dataChunkDataSize); + } else { + paddingSize = drwav__chunk_padding_size_w64(pWav->dataChunkDataSize); + } + + if (paddingSize > 0) { + drwav_uint64 paddingData = 0; + drwav__write(pWav, &paddingData, paddingSize); /* Byte order does not matter for this. */ + } + + /* + Chunk sizes. When using sequential mode, these will have been filled in at initialization time. We only need + to do this when using non-sequential mode. + */ + if (pWav->onSeek && !pWav->isSequentialWrite) { + if (pWav->container == drwav_container_riff) { + /* The "RIFF" chunk size. */ + if (pWav->onSeek(pWav->pUserData, 4, drwav_seek_origin_start)) { + drwav_uint32 riffChunkSize = drwav__riff_chunk_size_riff(pWav->dataChunkDataSize); + drwav__write_u32ne_to_le(pWav, riffChunkSize); + } + + /* the "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 4, drwav_seek_origin_start)) { + drwav_uint32 dataChunkSize = drwav__data_chunk_size_riff(pWav->dataChunkDataSize); + drwav__write_u32ne_to_le(pWav, dataChunkSize); + } + } else if (pWav->container == drwav_container_w64) { + /* The "RIFF" chunk size. 
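+                W64 chunk headers are bigger than RIFF ones: a 16-byte GUID identifier
+                followed by an 8-byte size field. That is why this branch seeks to byte 16
+                (and to dataChunkDataPos + 16 below) where the RIFF branch above used
+                offset 4.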
*/ + if (pWav->onSeek(pWav->pUserData, 16, drwav_seek_origin_start)) { + drwav_uint64 riffChunkSize = drwav__riff_chunk_size_w64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, riffChunkSize); + } + + /* The "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 16, drwav_seek_origin_start)) { + drwav_uint64 dataChunkSize = drwav__data_chunk_size_w64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, dataChunkSize); + } + } else if (pWav->container == drwav_container_rf64) { + /* We only need to update the ds64 chunk. The "RIFF" and "data" chunks always have their sizes set to 0xFFFFFFFF for RF64. */ + int ds64BodyPos = 12 + 8; + + /* The "RIFF" chunk size. */ + if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 0, drwav_seek_origin_start)) { + drwav_uint64 riffChunkSize = drwav__riff_chunk_size_rf64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, riffChunkSize); + } + + /* The "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 8, drwav_seek_origin_start)) { + drwav_uint64 dataChunkSize = drwav__data_chunk_size_rf64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, dataChunkSize); + } + } + } + + /* Validation for sequential mode. */ + if (pWav->isSequentialWrite) { + if (pWav->dataChunkDataSize != pWav->dataChunkDataSizeTargetWrite) { + result = DRWAV_INVALID_FILE; + } + } + } + +#ifndef DR_WAV_NO_STDIO + /* + If we opened the file with drwav_init_file() or drwav_init_file_write() we will want to close the file handle. We can tell whether or not + they were used by looking at the onRead and onWrite callbacks. + */ + if (pWav->onRead == drwav__on_read_stdio || pWav->onWrite == drwav__on_write_stdio) { + fclose((FILE*)pWav->pUserData); + } +#endif + + return result; +} + + + +DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut) +{ + size_t bytesRead; + + if (pWav == NULL || bytesToRead == 0) { + return 0; + } + + if (bytesToRead > pWav->bytesRemaining) { + bytesToRead = (size_t)pWav->bytesRemaining; + } + + if (pBufferOut != NULL) { + bytesRead = pWav->onRead(pWav->pUserData, pBufferOut, bytesToRead); + } else { + /* We need to seek. If we fail, we need to read-and-discard to make sure we get a good byte count. */ + bytesRead = 0; + while (bytesRead < bytesToRead) { + size_t bytesToSeek = (bytesToRead - bytesRead); + if (bytesToSeek > 0x7FFFFFFF) { + bytesToSeek = 0x7FFFFFFF; + } + + if (pWav->onSeek(pWav->pUserData, (int)bytesToSeek, drwav_seek_origin_current) == DRWAV_FALSE) { + break; + } + + bytesRead += bytesToSeek; + } + + /* When we get here we may need to read-and-discard some data. */ + while (bytesRead < bytesToRead) { + drwav_uint8 buffer[4096]; + size_t bytesSeeked; + size_t bytesToSeek = (bytesToRead - bytesRead); + if (bytesToSeek > sizeof(buffer)) { + bytesToSeek = sizeof(buffer); + } + + bytesSeeked = pWav->onRead(pWav->pUserData, buffer, bytesToSeek); + bytesRead += bytesSeeked; + + if (bytesSeeked < bytesToSeek) { + break; /* Reached the end. */ + } + } + } + + pWav->bytesRemaining -= bytesRead; + return bytesRead; +} + + + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + drwav_uint32 bytesPerFrame; + drwav_uint64 bytesToRead; /* Intentionally uint64 instead of size_t so we can do a check that we're not reading too much on 32-bit builds. */ + + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + /* Cannot use this function for compressed formats.
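+       MS-ADPCM and IMA-ADPCM frames do not occupy a fixed number of bytes, so the
+       bytesPerFrame arithmetic below would be meaningless for them; the dedicated
+       drwav_read_pcm_frames_s16() paths decode those formats instead.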
*/ + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + return 0; + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + bytesToRead = framesToRead * bytesPerFrame; + if (bytesToRead > DRWAV_SIZE_MAX) { + bytesToRead = (DRWAV_SIZE_MAX / bytesPerFrame) * bytesPerFrame; /* Round the number of bytes to read to a clean frame boundary. */ + } + + /* + Doing an explicit check here just to make it clear that we don't want to attempt to read anything if there are no bytes to read. It + *could* evaluate to 0 due to overflow. + */ + if (bytesToRead == 0) { + return 0; + } + + return drwav_read_raw(pWav, (size_t)bytesToRead, pBufferOut) / bytesPerFrame; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut); + + if (pBufferOut != NULL) { + drwav__bswap_samples(pBufferOut, framesRead*pWav->channels, drwav_get_bytes_per_pcm_frame(pWav)/pWav->channels, pWav->translatedFormatTag); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + if (drwav__is_little_endian()) { + return drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut); + } else { + return drwav_read_pcm_frames_be(pWav, framesToRead, pBufferOut); + } +} + + + +DRWAV_API drwav_bool32 drwav_seek_to_first_pcm_frame(drwav* pWav) +{ + if (pWav->onWrite != NULL) { + return DRWAV_FALSE; /* No seeking in write mode. */ + } + + if (!pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos, drwav_seek_origin_start)) { + return DRWAV_FALSE; + } + + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + pWav->compressed.iCurrentPCMFrame = 0; + + /* Cached data needs to be cleared for compressed formats. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + DRWAV_ZERO_OBJECT(&pWav->msadpcm); + } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + DRWAV_ZERO_OBJECT(&pWav->ima); + } else { + DRWAV_ASSERT(DRWAV_FALSE); /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */ + } + } + + pWav->bytesRemaining = pWav->dataChunkDataSize; + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex) +{ + /* Seeking should be compatible with wave files > 2GB. */ + + if (pWav == NULL || pWav->onSeek == NULL) { + return DRWAV_FALSE; + } + + /* No seeking in write mode. */ + if (pWav->onWrite != NULL) { + return DRWAV_FALSE; + } + + /* If there are no samples, just return DRWAV_TRUE without doing anything. */ + if (pWav->totalPCMFrameCount == 0) { + return DRWAV_TRUE; + } + + /* Make sure the target frame index is clamped. */ + if (targetFrameIndex >= pWav->totalPCMFrameCount) { + targetFrameIndex = pWav->totalPCMFrameCount - 1; + } + + /* + For compressed formats we just use a slow generic seek. If we are seeking forward we just seek forward. If we are going backwards we need + to seek back to the start. + */ + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + /* TODO: This can be optimized. */ + + /* + If we're seeking forward it's simple - just keep reading samples until we hit the sample we're requesting.
If we're seeking backwards, + we first need to seek back to the start and then just do the same thing as a forward seek. + */ + if (targetFrameIndex < pWav->compressed.iCurrentPCMFrame) { + if (!drwav_seek_to_first_pcm_frame(pWav)) { + return DRWAV_FALSE; + } + } + + if (targetFrameIndex > pWav->compressed.iCurrentPCMFrame) { + drwav_uint64 offsetInFrames = targetFrameIndex - pWav->compressed.iCurrentPCMFrame; + + drwav_int16 devnull[2048]; + while (offsetInFrames > 0) { + drwav_uint64 framesRead = 0; + drwav_uint64 framesToRead = offsetInFrames; + if (framesToRead > drwav_countof(devnull)/pWav->channels) { + framesToRead = drwav_countof(devnull)/pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + framesRead = drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, devnull); + } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + framesRead = drwav_read_pcm_frames_s16__ima(pWav, framesToRead, devnull); + } else { + DRWAV_ASSERT(DRWAV_FALSE); /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */ + } + + if (framesRead != framesToRead) { + return DRWAV_FALSE; + } + + offsetInFrames -= framesRead; + } + } + } else { + drwav_uint64 totalSizeInBytes; + drwav_uint64 currentBytePos; + drwav_uint64 targetBytePos; + drwav_uint64 offset; + + totalSizeInBytes = pWav->totalPCMFrameCount * drwav_get_bytes_per_pcm_frame(pWav); + DRWAV_ASSERT(totalSizeInBytes >= pWav->bytesRemaining); + + currentBytePos = totalSizeInBytes - pWav->bytesRemaining; + targetBytePos = targetFrameIndex * drwav_get_bytes_per_pcm_frame(pWav); + + if (currentBytePos < targetBytePos) { + /* Offset forwards. */ + offset = (targetBytePos - currentBytePos); + } else { + /* Offset backwards. */ + if (!drwav_seek_to_first_pcm_frame(pWav)) { + return DRWAV_FALSE; + } + offset = targetBytePos; + } + + while (offset > 0) { + int offset32 = ((offset > INT_MAX) ? INT_MAX : (int)offset); + if (!pWav->onSeek(pWav->pUserData, offset32, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + + pWav->bytesRemaining -= offset32; + offset -= offset32; + } + } + + return DRWAV_TRUE; +} + + +DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData) +{ + size_t bytesWritten; + + if (pWav == NULL || bytesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesWritten = pWav->onWrite(pWav->pUserData, pData, bytesToWrite); + pWav->dataChunkDataSize += bytesWritten; + + return bytesWritten; +} + + +DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + drwav_uint64 bytesToWrite; + drwav_uint64 bytesWritten; + const drwav_uint8* pRunningData; + + if (pWav == NULL || framesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8); + if (bytesToWrite > DRWAV_SIZE_MAX) { + return 0; + } + + bytesWritten = 0; + pRunningData = (const drwav_uint8*)pData; + + while (bytesToWrite > 0) { + size_t bytesJustWritten; + drwav_uint64 bytesToWriteThisIteration; + + bytesToWriteThisIteration = bytesToWrite; + DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX); /* <-- This is checked above. 
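+           On a typical 32-bit build DRWAV_SIZE_MAX is 0xFFFFFFFF, so the bounds check at
+           the top of this function is what makes the narrowing cast to size_t below safe.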
*/ + + bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, pRunningData); + if (bytesJustWritten == 0) { + break; + } + + bytesToWrite -= bytesJustWritten; + bytesWritten += bytesJustWritten; + pRunningData += bytesJustWritten; + } + + return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels; +} + +DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + drwav_uint64 bytesToWrite; + drwav_uint64 bytesWritten; + drwav_uint32 bytesPerSample; + const drwav_uint8* pRunningData; + + if (pWav == NULL || framesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8); + if (bytesToWrite > DRWAV_SIZE_MAX) { + return 0; + } + + bytesWritten = 0; + pRunningData = (const drwav_uint8*)pData; + + bytesPerSample = drwav_get_bytes_per_pcm_frame(pWav) / pWav->channels; + + while (bytesToWrite > 0) { + drwav_uint8 temp[4096]; + drwav_uint32 sampleCount; + size_t bytesJustWritten; + drwav_uint64 bytesToWriteThisIteration; + + bytesToWriteThisIteration = bytesToWrite; + DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX); /* <-- This is checked above. */ + + /* + WAV files are always little-endian. We need to byte swap on big-endian architectures. Since our input buffer is read-only we need + to use an intermediary buffer for the conversion. + */ + sampleCount = sizeof(temp)/bytesPerSample; + + if (bytesToWriteThisIteration > ((drwav_uint64)sampleCount)*bytesPerSample) { + bytesToWriteThisIteration = ((drwav_uint64)sampleCount)*bytesPerSample; + } + + DRWAV_COPY_MEMORY(temp, pRunningData, (size_t)bytesToWriteThisIteration); + drwav__bswap_samples(temp, sampleCount, bytesPerSample, pWav->translatedFormatTag); + + bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, temp); + if (bytesJustWritten == 0) { + break; + } + + bytesToWrite -= bytesJustWritten; + bytesWritten += bytesJustWritten; + pRunningData += bytesJustWritten; + } + + return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels; +} + +DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + if (drwav__is_little_endian()) { + return drwav_write_pcm_frames_le(pWav, framesToWrite, pData); + } else { + return drwav_write_pcm_frames_be(pWav, framesToWrite, pData); + } +} + + +static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead = 0; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(framesToRead > 0); + + /* TODO: Lots of room for optimization here. */ + + while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + /* If there are no cached frames we need to load a new block. */ + if (pWav->msadpcm.cachedFrameCount == 0 && pWav->msadpcm.bytesRemainingInBlock == 0) { + if (pWav->channels == 1) { + /* Mono. 
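+                    A mono MS-ADPCM block begins with a 7-byte header laid out as:
+                        byte 0     predictor (index into the coefficient tables)
+                        bytes 1-2  initial delta (little-endian int16)
+                        bytes 3-4  sample n-1
+                        bytes 5-6  sample n-2
+                    The two seed samples become the first two cached output frames below.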
*/ + drwav_uint8 header[7]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + pWav->msadpcm.predictor[0] = header[0]; + pWav->msadpcm.delta[0] = drwav__bytes_to_s16(header + 1); + pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 3); + pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 5); + pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][0]; + pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.cachedFrameCount = 2; + } else { + /* Stereo. */ + drwav_uint8 header[14]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + pWav->msadpcm.predictor[0] = header[0]; + pWav->msadpcm.predictor[1] = header[1]; + pWav->msadpcm.delta[0] = drwav__bytes_to_s16(header + 2); + pWav->msadpcm.delta[1] = drwav__bytes_to_s16(header + 4); + pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 6); + pWav->msadpcm.prevFrames[1][1] = (drwav_int32)drwav__bytes_to_s16(header + 8); + pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 10); + pWav->msadpcm.prevFrames[1][0] = (drwav_int32)drwav__bytes_to_s16(header + 12); + + pWav->msadpcm.cachedFrames[0] = pWav->msadpcm.prevFrames[0][0]; + pWav->msadpcm.cachedFrames[1] = pWav->msadpcm.prevFrames[1][0]; + pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[1][1]; + pWav->msadpcm.cachedFrameCount = 2; + } + } + + /* Output anything that's cached. */ + while (framesToRead > 0 && pWav->msadpcm.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + if (pBufferOut != NULL) { + drwav_uint32 iSample = 0; + for (iSample = 0; iSample < pWav->channels; iSample += 1) { + pBufferOut[iSample] = (drwav_int16)pWav->msadpcm.cachedFrames[(drwav_countof(pWav->msadpcm.cachedFrames) - (pWav->msadpcm.cachedFrameCount*pWav->channels)) + iSample]; + } + + pBufferOut += pWav->channels; + } + + framesToRead -= 1; + totalFramesRead += 1; + pWav->compressed.iCurrentPCMFrame += 1; + pWav->msadpcm.cachedFrameCount -= 1; + } + + if (framesToRead == 0) { + return totalFramesRead; + } + + + /* + If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next + loop iteration which will trigger the loading of a new block. + */ + if (pWav->msadpcm.cachedFrameCount == 0) { + if (pWav->msadpcm.bytesRemainingInBlock == 0) { + continue; + } else { + static drwav_int32 adaptationTable[] = { + 230, 230, 230, 230, 307, 409, 512, 614, + 768, 614, 512, 409, 307, 230, 230, 230 + }; + static drwav_int32 coeff1Table[] = { 256, 512, 0, 192, 240, 460, 392 }; + static drwav_int32 coeff2Table[] = { 0, -256, 0, 64, 0, -208, -232 }; + + drwav_uint8 nibbles; + drwav_int32 nibble0; + drwav_int32 nibble1; + + if (pWav->onRead(pWav->pUserData, &nibbles, 1) != 1) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock -= 1; + + /* TODO: Optimize away these if statements. */ + nibble0 = ((nibbles & 0xF0) >> 4); if ((nibbles & 0x80)) { nibble0 |= 0xFFFFFFF0UL; } + nibble1 = ((nibbles & 0x0F) >> 0); if ((nibbles & 0x08)) { nibble1 |= 0xFFFFFFF0UL; } + + if (pWav->channels == 1) { + /* Mono. 
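+                        Each nibble then expands through the standard MS-ADPCM recurrence,
+                        sketched as:
+                            predicted = (samp1*coeff1 + samp2*coeff2) >> 8
+                            sample    = clamp(predicted + nibble*delta, -32768, 32767)
+                            delta     = max(16, (adaptationTable[nibble] * delta) >> 8)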
*/ + drwav_int32 newSample0; + drwav_int32 newSample1; + + newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample0 += nibble0 * pWav->msadpcm.delta[0]; + newSample0 = drwav_clamp(newSample0, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample0; + + + newSample1 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample1 += nibble1 * pWav->msadpcm.delta[0]; + newSample1 = drwav_clamp(newSample1, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample1; + + + pWav->msadpcm.cachedFrames[2] = newSample0; + pWav->msadpcm.cachedFrames[3] = newSample1; + pWav->msadpcm.cachedFrameCount = 2; + } else { + /* Stereo. */ + drwav_int32 newSample0; + drwav_int32 newSample1; + + /* Left. */ + newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample0 += nibble0 * pWav->msadpcm.delta[0]; + newSample0 = drwav_clamp(newSample0, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample0; + + + /* Right. 
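+                        In stereo blocks the high nibble carries the left channel (index 0)
+                        and the low nibble the right channel (index 1), each with its own
+                        predictor, delta and two-sample history.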
*/ + newSample1 = ((pWav->msadpcm.prevFrames[1][1] * coeff1Table[pWav->msadpcm.predictor[1]]) + (pWav->msadpcm.prevFrames[1][0] * coeff2Table[pWav->msadpcm.predictor[1]])) >> 8; + newSample1 += nibble1 * pWav->msadpcm.delta[1]; + newSample1 = drwav_clamp(newSample1, -32768, 32767); + + pWav->msadpcm.delta[1] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8; + if (pWav->msadpcm.delta[1] < 16) { + pWav->msadpcm.delta[1] = 16; + } + + pWav->msadpcm.prevFrames[1][0] = pWav->msadpcm.prevFrames[1][1]; + pWav->msadpcm.prevFrames[1][1] = newSample1; + + pWav->msadpcm.cachedFrames[2] = newSample0; + pWav->msadpcm.cachedFrames[3] = newSample1; + pWav->msadpcm.cachedFrameCount = 1; + } + } + } + } + + return totalFramesRead; +} + + +static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead = 0; + drwav_uint32 iChannel; + + static drwav_int32 indexTable[16] = { + -1, -1, -1, -1, 2, 4, 6, 8, + -1, -1, -1, -1, 2, 4, 6, 8 + }; + + static drwav_int32 stepTable[89] = { + 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 19, 21, 23, 25, 28, 31, 34, 37, 41, 45, + 50, 55, 60, 66, 73, 80, 88, 97, 107, 118, + 130, 143, 157, 173, 190, 209, 230, 253, 279, 307, + 337, 371, 408, 449, 494, 544, 598, 658, 724, 796, + 876, 963, 1060, 1166, 1282, 1411, 1552, 1707, 1878, 2066, + 2272, 2499, 2749, 3024, 3327, 3660, 4026, 4428, 4871, 5358, + 5894, 6484, 7132, 7845, 8630, 9493, 10442, 11487, 12635, 13899, + 15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767 + }; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(framesToRead > 0); + + /* TODO: Lots of room for optimization here. */ + + while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + /* If there are no cached samples we need to load a new block. */ + if (pWav->ima.cachedFrameCount == 0 && pWav->ima.bytesRemainingInBlock == 0) { + if (pWav->channels == 1) { + /* Mono. */ + drwav_uint8 header[4]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + if (header[2] >= drwav_countof(stepTable)) { + pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current); + pWav->ima.bytesRemainingInBlock = 0; + return totalFramesRead; /* Invalid data. */ + } + + pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0); + pWav->ima.stepIndex[0] = header[2]; + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[0]; + pWav->ima.cachedFrameCount = 1; + } else { + /* Stereo. */ + drwav_uint8 header[8]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + if (header[2] >= drwav_countof(stepTable) || header[6] >= drwav_countof(stepTable)) { + pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current); + pWav->ima.bytesRemainingInBlock = 0; + return totalFramesRead; /* Invalid data. 
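+                        A step index outside [0, 88] cannot come from a conforming encoder,
+                        so the remainder of the block is skipped rather than decoded into
+                        garbage.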
*/ + } + + pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0); + pWav->ima.stepIndex[0] = header[2]; + pWav->ima.predictor[1] = drwav__bytes_to_s16(header + 4); + pWav->ima.stepIndex[1] = header[6]; + + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 2] = pWav->ima.predictor[0]; + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[1]; + pWav->ima.cachedFrameCount = 1; + } + } + + /* Output anything that's cached. */ + while (framesToRead > 0 && pWav->ima.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + if (pBufferOut != NULL) { + drwav_uint32 iSample; + for (iSample = 0; iSample < pWav->channels; iSample += 1) { + pBufferOut[iSample] = (drwav_int16)pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + iSample]; + } + pBufferOut += pWav->channels; + } + + framesToRead -= 1; + totalFramesRead += 1; + pWav->compressed.iCurrentPCMFrame += 1; + pWav->ima.cachedFrameCount -= 1; + } + + if (framesToRead == 0) { + return totalFramesRead; + } + + /* + If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next + loop iteration which will trigger the loading of a new block. + */ + if (pWav->ima.cachedFrameCount == 0) { + if (pWav->ima.bytesRemainingInBlock == 0) { + continue; + } else { + /* + From what I can tell with stereo streams, it looks like every 4 bytes (8 samples) is for one channel. So it goes 4 bytes for the + left channel, 4 bytes for the right channel. + */ + pWav->ima.cachedFrameCount = 8; + for (iChannel = 0; iChannel < pWav->channels; ++iChannel) { + drwav_uint32 iByte; + drwav_uint8 nibbles[4]; + if (pWav->onRead(pWav->pUserData, &nibbles, 4) != 4) { + pWav->ima.cachedFrameCount = 0; + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock -= 4; + + for (iByte = 0; iByte < 4; ++iByte) { + drwav_uint8 nibble0 = ((nibbles[iByte] & 0x0F) >> 0); + drwav_uint8 nibble1 = ((nibbles[iByte] & 0xF0) >> 4); + + drwav_int32 step = stepTable[pWav->ima.stepIndex[iChannel]]; + drwav_int32 predictor = pWav->ima.predictor[iChannel]; + + drwav_int32 diff = step >> 3; + if (nibble0 & 1) diff += step >> 2; + if (nibble0 & 2) diff += step >> 1; + if (nibble0 & 4) diff += step; + if (nibble0 & 8) diff = -diff; + + predictor = drwav_clamp(predictor + diff, -32768, 32767); + pWav->ima.predictor[iChannel] = predictor; + pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble0], 0, (drwav_int32)drwav_countof(stepTable)-1); + pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+0)*pWav->channels + iChannel] = predictor; + + + step = stepTable[pWav->ima.stepIndex[iChannel]]; + predictor = pWav->ima.predictor[iChannel]; + + diff = step >> 3; + if (nibble1 & 1) diff += step >> 2; + if (nibble1 & 2) diff += step >> 1; + if (nibble1 & 4) diff += step; + if (nibble1 & 8) diff = -diff; + + predictor = drwav_clamp(predictor + diff, -32768, 32767); + pWav->ima.predictor[iChannel] = predictor; + pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble1], 0, (drwav_int32)drwav_countof(stepTable)-1); + pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+1)*pWav->channels + iChannel] = predictor; + } + } + } + } + } + + return totalFramesRead; +} + + +#ifndef 
DR_WAV_NO_CONVERSION_API +static unsigned short g_drwavAlawTable[256] = { + 0xEA80, 0xEB80, 0xE880, 0xE980, 0xEE80, 0xEF80, 0xEC80, 0xED80, 0xE280, 0xE380, 0xE080, 0xE180, 0xE680, 0xE780, 0xE480, 0xE580, + 0xF540, 0xF5C0, 0xF440, 0xF4C0, 0xF740, 0xF7C0, 0xF640, 0xF6C0, 0xF140, 0xF1C0, 0xF040, 0xF0C0, 0xF340, 0xF3C0, 0xF240, 0xF2C0, + 0xAA00, 0xAE00, 0xA200, 0xA600, 0xBA00, 0xBE00, 0xB200, 0xB600, 0x8A00, 0x8E00, 0x8200, 0x8600, 0x9A00, 0x9E00, 0x9200, 0x9600, + 0xD500, 0xD700, 0xD100, 0xD300, 0xDD00, 0xDF00, 0xD900, 0xDB00, 0xC500, 0xC700, 0xC100, 0xC300, 0xCD00, 0xCF00, 0xC900, 0xCB00, + 0xFEA8, 0xFEB8, 0xFE88, 0xFE98, 0xFEE8, 0xFEF8, 0xFEC8, 0xFED8, 0xFE28, 0xFE38, 0xFE08, 0xFE18, 0xFE68, 0xFE78, 0xFE48, 0xFE58, + 0xFFA8, 0xFFB8, 0xFF88, 0xFF98, 0xFFE8, 0xFFF8, 0xFFC8, 0xFFD8, 0xFF28, 0xFF38, 0xFF08, 0xFF18, 0xFF68, 0xFF78, 0xFF48, 0xFF58, + 0xFAA0, 0xFAE0, 0xFA20, 0xFA60, 0xFBA0, 0xFBE0, 0xFB20, 0xFB60, 0xF8A0, 0xF8E0, 0xF820, 0xF860, 0xF9A0, 0xF9E0, 0xF920, 0xF960, + 0xFD50, 0xFD70, 0xFD10, 0xFD30, 0xFDD0, 0xFDF0, 0xFD90, 0xFDB0, 0xFC50, 0xFC70, 0xFC10, 0xFC30, 0xFCD0, 0xFCF0, 0xFC90, 0xFCB0, + 0x1580, 0x1480, 0x1780, 0x1680, 0x1180, 0x1080, 0x1380, 0x1280, 0x1D80, 0x1C80, 0x1F80, 0x1E80, 0x1980, 0x1880, 0x1B80, 0x1A80, + 0x0AC0, 0x0A40, 0x0BC0, 0x0B40, 0x08C0, 0x0840, 0x09C0, 0x0940, 0x0EC0, 0x0E40, 0x0FC0, 0x0F40, 0x0CC0, 0x0C40, 0x0DC0, 0x0D40, + 0x5600, 0x5200, 0x5E00, 0x5A00, 0x4600, 0x4200, 0x4E00, 0x4A00, 0x7600, 0x7200, 0x7E00, 0x7A00, 0x6600, 0x6200, 0x6E00, 0x6A00, + 0x2B00, 0x2900, 0x2F00, 0x2D00, 0x2300, 0x2100, 0x2700, 0x2500, 0x3B00, 0x3900, 0x3F00, 0x3D00, 0x3300, 0x3100, 0x3700, 0x3500, + 0x0158, 0x0148, 0x0178, 0x0168, 0x0118, 0x0108, 0x0138, 0x0128, 0x01D8, 0x01C8, 0x01F8, 0x01E8, 0x0198, 0x0188, 0x01B8, 0x01A8, + 0x0058, 0x0048, 0x0078, 0x0068, 0x0018, 0x0008, 0x0038, 0x0028, 0x00D8, 0x00C8, 0x00F8, 0x00E8, 0x0098, 0x0088, 0x00B8, 0x00A8, + 0x0560, 0x0520, 0x05E0, 0x05A0, 0x0460, 0x0420, 0x04E0, 0x04A0, 0x0760, 0x0720, 0x07E0, 0x07A0, 0x0660, 0x0620, 0x06E0, 0x06A0, + 0x02B0, 0x0290, 0x02F0, 0x02D0, 0x0230, 0x0210, 0x0270, 0x0250, 0x03B0, 0x0390, 0x03F0, 0x03D0, 0x0330, 0x0310, 0x0370, 0x0350 +}; + +static unsigned short g_drwavMulawTable[256] = { + 0x8284, 0x8684, 0x8A84, 0x8E84, 0x9284, 0x9684, 0x9A84, 0x9E84, 0xA284, 0xA684, 0xAA84, 0xAE84, 0xB284, 0xB684, 0xBA84, 0xBE84, + 0xC184, 0xC384, 0xC584, 0xC784, 0xC984, 0xCB84, 0xCD84, 0xCF84, 0xD184, 0xD384, 0xD584, 0xD784, 0xD984, 0xDB84, 0xDD84, 0xDF84, + 0xE104, 0xE204, 0xE304, 0xE404, 0xE504, 0xE604, 0xE704, 0xE804, 0xE904, 0xEA04, 0xEB04, 0xEC04, 0xED04, 0xEE04, 0xEF04, 0xF004, + 0xF0C4, 0xF144, 0xF1C4, 0xF244, 0xF2C4, 0xF344, 0xF3C4, 0xF444, 0xF4C4, 0xF544, 0xF5C4, 0xF644, 0xF6C4, 0xF744, 0xF7C4, 0xF844, + 0xF8A4, 0xF8E4, 0xF924, 0xF964, 0xF9A4, 0xF9E4, 0xFA24, 0xFA64, 0xFAA4, 0xFAE4, 0xFB24, 0xFB64, 0xFBA4, 0xFBE4, 0xFC24, 0xFC64, + 0xFC94, 0xFCB4, 0xFCD4, 0xFCF4, 0xFD14, 0xFD34, 0xFD54, 0xFD74, 0xFD94, 0xFDB4, 0xFDD4, 0xFDF4, 0xFE14, 0xFE34, 0xFE54, 0xFE74, + 0xFE8C, 0xFE9C, 0xFEAC, 0xFEBC, 0xFECC, 0xFEDC, 0xFEEC, 0xFEFC, 0xFF0C, 0xFF1C, 0xFF2C, 0xFF3C, 0xFF4C, 0xFF5C, 0xFF6C, 0xFF7C, + 0xFF88, 0xFF90, 0xFF98, 0xFFA0, 0xFFA8, 0xFFB0, 0xFFB8, 0xFFC0, 0xFFC8, 0xFFD0, 0xFFD8, 0xFFE0, 0xFFE8, 0xFFF0, 0xFFF8, 0x0000, + 0x7D7C, 0x797C, 0x757C, 0x717C, 0x6D7C, 0x697C, 0x657C, 0x617C, 0x5D7C, 0x597C, 0x557C, 0x517C, 0x4D7C, 0x497C, 0x457C, 0x417C, + 0x3E7C, 0x3C7C, 0x3A7C, 0x387C, 0x367C, 0x347C, 0x327C, 0x307C, 0x2E7C, 0x2C7C, 0x2A7C, 0x287C, 0x267C, 0x247C, 0x227C, 0x207C, + 0x1EFC, 0x1DFC, 0x1CFC, 0x1BFC, 0x1AFC, 
0x19FC, 0x18FC, 0x17FC, 0x16FC, 0x15FC, 0x14FC, 0x13FC, 0x12FC, 0x11FC, 0x10FC, 0x0FFC, + 0x0F3C, 0x0EBC, 0x0E3C, 0x0DBC, 0x0D3C, 0x0CBC, 0x0C3C, 0x0BBC, 0x0B3C, 0x0ABC, 0x0A3C, 0x09BC, 0x093C, 0x08BC, 0x083C, 0x07BC, + 0x075C, 0x071C, 0x06DC, 0x069C, 0x065C, 0x061C, 0x05DC, 0x059C, 0x055C, 0x051C, 0x04DC, 0x049C, 0x045C, 0x041C, 0x03DC, 0x039C, + 0x036C, 0x034C, 0x032C, 0x030C, 0x02EC, 0x02CC, 0x02AC, 0x028C, 0x026C, 0x024C, 0x022C, 0x020C, 0x01EC, 0x01CC, 0x01AC, 0x018C, + 0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, 0x00F4, 0x00E4, 0x00D4, 0x00C4, 0x00B4, 0x00A4, 0x0094, 0x0084, + 0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000 +}; + +static DRWAV_INLINE drwav_int16 drwav__alaw_to_s16(drwav_uint8 sampleIn) +{ + return (short)g_drwavAlawTable[sampleIn]; +} + +static DRWAV_INLINE drwav_int16 drwav__mulaw_to_s16(drwav_uint8 sampleIn) +{ + return (short)g_drwavMulawTable[sampleIn]; +} + + + +static void drwav__pcm_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_s16(pOut, pIn, totalSampleCount); + return; + } + + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + for (i = 0; i < totalSampleCount; ++i) { + *pOut++ = ((const drwav_int16*)pIn)[i]; + } + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_s16(pOut, pIn, totalSampleCount); + return; + } + if (bytesPerSample == 4) { + drwav_s32_to_s16(pOut, (const drwav_int32*)pIn, totalSampleCount); + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. */ + for (i = 0; i < totalSampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (drwav_int16)((drwav_int64)sample >> 48); + } +} + +static void drwav__ieee_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + drwav_f32_to_s16(pOut, (const float*)pIn, totalSampleCount); + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_s16(pOut, (const double*)pIn, totalSampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } +} + +static drwav_uint64 drwav_read_pcm_frames_s16__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint32 bytesPerFrame; + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + /* Fast path. 
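+       16-bit PCM is already in the output representation, so it can be streamed straight
+       through without the sampleData staging buffer. The same shortcut applies when
+       pBufferOut is NULL and the caller only wants to skip frames.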
*/ + if ((pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) || pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + 
if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + if (framesToRead * pWav->channels * sizeof(drwav_int16) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int16) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_s16__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_s16__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_s16__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_s16__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_s16__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = pIn[i]; + r = x << 8; + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = ((int)(((unsigned int)(((const drwav_uint8*)pIn)[i*3+0]) << 8) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+1]) << 16) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+2])) << 24)) >> 8; + r = x >> 8; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = pIn[i]; + r = x >> 16; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + float x = pIn[i]; + float c; + c = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); + c = c + 1; + r = (int)(c * 32767.5f); + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + double x = pIn[i]; + double c; + c = ((x < -1) ? -1 : ((x > 1) ? 
1 : x)); + c = c + 1; + r = (int)(c * 32767.5); + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + for (i = 0; i < sampleCount; ++i) { + pOut[i] = drwav__alaw_to_s16(pIn[i]); + } +} + +DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + for (i = 0; i < sampleCount; ++i) { + pOut[i] = drwav__mulaw_to_s16(pIn[i]); + } +} + + + +static void drwav__pcm_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_f32(pOut, pIn, sampleCount); + return; + } + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + drwav_s16_to_f32(pOut, (const drwav_int16*)pIn, sampleCount); + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_f32(pOut, pIn, sampleCount); + return; + } + if (bytesPerSample == 4) { + drwav_s32_to_f32(pOut, (const drwav_int32*)pIn, sampleCount); + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. */ + for (i = 0; i < sampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (float)((drwav_int64)sample / 9223372036854775807.0); + } +} + +static void drwav__ieee_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + unsigned int i; + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((const float*)pIn)[i]; + } + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_f32(pOut, (const double*)pIn, sampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut)); + return; + } +} + + +static drwav_uint64 drwav_read_pcm_frames_f32__pcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_f32(pBufferOut, sampleData, (size_t)framesRead*pWav->channels, bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. 
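+    Frames are decoded into a 2048-sample drwav_int16 staging buffer and then widened with
+    drwav_s16_to_f32(). ADPCM decodes to 16-bit samples natively, so nothing is lost in the
+    round trip.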
+ */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__ima(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__ieee(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + /* Fast path. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__alaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__mulaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, 
drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + if (framesToRead * pWav->channels * sizeof(float) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(float) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_f32__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_f32__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_f32__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_f32__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_f32__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_f32__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + +#ifdef DR_WAV_LIBSNDFILE_COMPAT + /* + It appears libsndfile uses slightly different logic for the u8 -> f32 conversion to dr_wav, which in my opinion is incorrect. It appears + libsndfile performs the conversion something like "f32 = (u8 / 256) * 2 - 1", however I think it should be "f32 = (u8 / 255) * 2 - 1" (note + the divisor of 256 vs 255). I use libsndfile as a benchmark for testing, so I'm therefore leaving this block here just for my automated + correctness testing. This is disabled by default. 
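+
+    As a concrete illustration of the difference: with the 256 divisor, the maximum input of 255 maps to (255/256)*2 - 1 = 0.9921875,
+    whereas with the 255 divisor it maps to exactly 1.0.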
+ */ + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (pIn[i] / 256.0f) * 2 - 1; + } +#else + for (i = 0; i < sampleCount; ++i) { + float x = pIn[i]; + x = x * 0.00784313725490196078f; /* 0..255 to 0..2 */ + x = x - 1; /* 0..2 to -1..1 */ + + *pOut++ = x; + } +#endif +} + +DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = pIn[i] * 0.000030517578125f; + } +} + +DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + double x; + drwav_uint32 a = ((drwav_uint32)(pIn[i*3+0]) << 8); + drwav_uint32 b = ((drwav_uint32)(pIn[i*3+1]) << 16); + drwav_uint32 c = ((drwav_uint32)(pIn[i*3+2]) << 24); + + x = (double)((drwav_int32)(a | b | c) >> 8); + *pOut++ = (float)(x * 0.00000011920928955078125); + } +} + +DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount) +{ + size_t i; + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (float)(pIn[i] / 2147483648.0); + } +} + +DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (float)pIn[i]; + } +} + +DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = drwav__alaw_to_s16(pIn[i]) / 32768.0f; + } +} + +DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = drwav__mulaw_to_s16(pIn[i]) / 32768.0f; + } +} + + + +static void drwav__pcm_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_s32(pOut, pIn, totalSampleCount); + return; + } + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + drwav_s16_to_s32(pOut, (const drwav_int16*)pIn, totalSampleCount); + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_s32(pOut, pIn, totalSampleCount); + return; + } + if (bytesPerSample == 4) { + for (i = 0; i < totalSampleCount; ++i) { + *pOut++ = ((const drwav_int32*)pIn)[i]; + } + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. 
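+       Each little-endian sample is assembled into the top bytes of a 64-bit integer so that its sign bit lands in bit 63,
+       then arithmetic-shifted down to produce a signed 32-bit result.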
*/ + for (i = 0; i < totalSampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (drwav_int32)((drwav_int64)sample >> 32); + } +} + +static void drwav__ieee_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + drwav_f32_to_s32(pOut, (const float*)pIn, totalSampleCount); + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_s32(pOut, (const double*)pIn, totalSampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } +} + + +static drwav_uint64 drwav_read_pcm_frames_s32__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + /* Fast path. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. 
*/ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. 
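+       The clamp below keeps framesToRead * channels * sizeof(drwav_int32) within DRWAV_SIZE_MAX, so subsequent size_t
+       conversions cannot overflow.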
*/ + if (framesToRead * pWav->channels * sizeof(drwav_int32) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int32) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_s32__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_s32__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_s32__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_s32__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_s32__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_s32__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((int)pIn[i] - 128) << 24; + } +} + +DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = pIn[i] << 16; + } +} + +DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + unsigned int s0 = pIn[i*3 + 0]; + unsigned int s1 = pIn[i*3 + 1]; + unsigned int s2 = pIn[i*3 + 2]; + + drwav_int32 sample32 = (drwav_int32)((s0 << 8) | (s1 << 16) | (s2 << 24)); + *pOut++ = sample32; + } +} + +DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]); + } +} + +DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]); + } +} + +DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((drwav_int32)drwav__alaw_to_s16(pIn[i])) << 16; + } +} + +DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, 
size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i= 0; i < sampleCount; ++i) { + *pOut++ = ((drwav_int32)drwav__mulaw_to_s16(pIn[i])) << 16; + } +} + + + +static drwav_int16* drwav__read_pcm_frames_and_close_s16(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + drwav_int16* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int16); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (drwav_int16*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. */ + } + + framesRead = drwav_read_pcm_frames_s16(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + +static float* drwav__read_pcm_frames_and_close_f32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + float* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(float); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (float*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. */ + } + + framesRead = drwav_read_pcm_frames_f32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + +static drwav_int32* drwav__read_pcm_frames_and_close_s32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + drwav_int32* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int32); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (drwav_int32*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. 
*/ + } + + framesRead = drwav_read_pcm_frames_s32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + + + +DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +#ifndef DR_WAV_NO_STDIO +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, 
pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + + +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} +#endif + +DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if 
(totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} +#endif /* DR_WAV_NO_CONVERSION_API */ + + +DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks != NULL) { + drwav__free_from_callbacks(p, pAllocationCallbacks); + } else { + drwav__free_default(p, NULL); + } +} + +DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data) +{ + return drwav__bytes_to_u16(data); +} + +DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data) +{ + return drwav__bytes_to_s16(data); +} + +DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data) +{ + return drwav__bytes_to_u32(data); +} + +DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data) +{ + return drwav__bytes_to_s32(data); +} + +DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data) +{ + return drwav__bytes_to_u64(data); +} + +DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data) +{ + return drwav__bytes_to_s64(data); +} + + +DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]) +{ + return drwav__guid_equal(a, b); +} + +DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b) +{ + return drwav__fourcc_equal(a, b); +} + +#endif /* dr_wav_c */ +#endif /* DR_WAV_IMPLEMENTATION */ + +/* +RELEASE NOTES - v0.11.0 +======================= +Version 0.11.0 has breaking API changes. + +Improved Client-Defined Memory Allocation +----------------------------------------- +The main change with this release is the addition of a more flexible way of implementing custom memory allocation routines. The +existing system of DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE are still in place and will be used by default when no custom +allocation callbacks are specified. + +To use the new system, you pass in a pointer to a drwav_allocation_callbacks object to drwav_init() and family, like this: + + void* my_malloc(size_t sz, void* pUserData) + { + return malloc(sz); + } + void* my_realloc(void* p, size_t sz, void* pUserData) + { + return realloc(p, sz); + } + void my_free(void* p, void* pUserData) + { + free(p); + } + + ... + + drwav_allocation_callbacks allocationCallbacks; + allocationCallbacks.pUserData = &myData; + allocationCallbacks.onMalloc = my_malloc; + allocationCallbacks.onRealloc = my_realloc; + allocationCallbacks.onFree = my_free; + drwav_init_file(&wav, "my_file.wav", &allocationCallbacks); + +The advantage of this new system is that it allows you to specify user data which will be passed in to the allocation routines. 
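+
+As a hypothetical illustration (this sketch is not part of the library), the user data could carry a running byte counter so
+you can observe how much memory dr_wav requests:
+
+    typedef struct
+    {
+        size_t totalAllocated;  /* Running total of bytes requested via onMalloc. */
+    } my_alloc_stats;
+
+    void* my_counting_malloc(size_t sz, void* pUserData)
+    {
+        ((my_alloc_stats*)pUserData)->totalAllocated += sz;
+        return malloc(sz);
+    }
+
+    ...
+
+    my_alloc_stats stats = {0};
+    allocationCallbacks.pUserData = &stats;
+    allocationCallbacks.onMalloc  = my_counting_malloc;  /* onRealloc and onFree would be wired up the same way. */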
+
+Passing in null for the allocation callbacks object will cause dr_wav to use defaults, which is the same as using
+DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE, and is equivalent to how it worked in previous versions.
+
+Every API that opens a drwav object now takes this extra parameter. These include the following:
+
+    drwav_init()
+    drwav_init_ex()
+    drwav_init_file()
+    drwav_init_file_ex()
+    drwav_init_file_w()
+    drwav_init_file_w_ex()
+    drwav_init_memory()
+    drwav_init_memory_ex()
+    drwav_init_write()
+    drwav_init_write_sequential()
+    drwav_init_write_sequential_pcm_frames()
+    drwav_init_file_write()
+    drwav_init_file_write_sequential()
+    drwav_init_file_write_sequential_pcm_frames()
+    drwav_init_file_write_w()
+    drwav_init_file_write_sequential_w()
+    drwav_init_file_write_sequential_pcm_frames_w()
+    drwav_init_memory_write()
+    drwav_init_memory_write_sequential()
+    drwav_init_memory_write_sequential_pcm_frames()
+    drwav_open_and_read_pcm_frames_s16()
+    drwav_open_and_read_pcm_frames_f32()
+    drwav_open_and_read_pcm_frames_s32()
+    drwav_open_file_and_read_pcm_frames_s16()
+    drwav_open_file_and_read_pcm_frames_f32()
+    drwav_open_file_and_read_pcm_frames_s32()
+    drwav_open_file_and_read_pcm_frames_s16_w()
+    drwav_open_file_and_read_pcm_frames_f32_w()
+    drwav_open_file_and_read_pcm_frames_s32_w()
+    drwav_open_memory_and_read_pcm_frames_s16()
+    drwav_open_memory_and_read_pcm_frames_f32()
+    drwav_open_memory_and_read_pcm_frames_s32()
+
+Endian Improvements
+-------------------
+Previously, the following APIs returned little-endian audio data. These now return native-endian data. This improves compatibility
+on big-endian architectures.
+
+    drwav_read_pcm_frames()
+    drwav_read_pcm_frames_s16()
+    drwav_read_pcm_frames_s32()
+    drwav_read_pcm_frames_f32()
+    drwav_open_and_read_pcm_frames_s16()
+    drwav_open_and_read_pcm_frames_s32()
+    drwav_open_and_read_pcm_frames_f32()
+    drwav_open_file_and_read_pcm_frames_s16()
+    drwav_open_file_and_read_pcm_frames_s32()
+    drwav_open_file_and_read_pcm_frames_f32()
+    drwav_open_file_and_read_pcm_frames_s16_w()
+    drwav_open_file_and_read_pcm_frames_s32_w()
+    drwav_open_file_and_read_pcm_frames_f32_w()
+    drwav_open_memory_and_read_pcm_frames_s16()
+    drwav_open_memory_and_read_pcm_frames_s32()
+    drwav_open_memory_and_read_pcm_frames_f32()
+
+APIs have been added to give you explicit control over whether or not audio data is read or written in big- or little-endian byte
+order:
+
+    drwav_read_pcm_frames_le()
+    drwav_read_pcm_frames_be()
+    drwav_read_pcm_frames_s16le()
+    drwav_read_pcm_frames_s16be()
+    drwav_read_pcm_frames_f32le()
+    drwav_read_pcm_frames_f32be()
+    drwav_read_pcm_frames_s32le()
+    drwav_read_pcm_frames_s32be()
+    drwav_write_pcm_frames_le()
+    drwav_write_pcm_frames_be()
+
+Removed APIs
+------------
+The following APIs were deprecated in version 0.10.0 and have now been removed:
+
+    drwav_open()
+    drwav_open_ex()
+    drwav_open_write()
+    drwav_open_write_sequential()
+    drwav_open_file()
+    drwav_open_file_ex()
+    drwav_open_file_write()
+    drwav_open_file_write_sequential()
+    drwav_open_memory()
+    drwav_open_memory_ex()
+    drwav_open_memory_write()
+    drwav_open_memory_write_sequential()
+    drwav_close()
+
+
+
+RELEASE NOTES - v0.10.0
+=======================
+Version 0.10.0 has breaking API changes. There are no significant bug fixes in this release, so if you are affected you do
+not need to upgrade.
+
+Removed APIs
+------------
+The following APIs were deprecated in version 0.9.0 and have been completely removed in version 0.10.0:
+
+    drwav_read()
+    drwav_read_s16()
+    drwav_read_f32()
+    drwav_read_s32()
+    drwav_seek_to_sample()
+    drwav_write()
+    drwav_open_and_read_s16()
+    drwav_open_and_read_f32()
+    drwav_open_and_read_s32()
+    drwav_open_file_and_read_s16()
+    drwav_open_file_and_read_f32()
+    drwav_open_file_and_read_s32()
+    drwav_open_memory_and_read_s16()
+    drwav_open_memory_and_read_f32()
+    drwav_open_memory_and_read_s32()
+    drwav::totalSampleCount
+
+See release notes for version 0.9.0 at the bottom of this file for replacement APIs.
+
+Deprecated APIs
+---------------
+The following APIs have been deprecated. There is a confusing and completely arbitrary difference between drwav_init*() and
+drwav_open*(), where drwav_init*() initializes a pre-allocated drwav object, whereas drwav_open*() will first allocate a
+drwav object on the heap and then initialize it. drwav_open*() has been deprecated, which means you must now use a
+pre-allocated drwav object with drwav_init*(). If you need the previous functionality, you can just do a malloc() followed
+by a call to one of the drwav_init*() APIs.
+
+    drwav_open()
+    drwav_open_ex()
+    drwav_open_write()
+    drwav_open_write_sequential()
+    drwav_open_file()
+    drwav_open_file_ex()
+    drwav_open_file_write()
+    drwav_open_file_write_sequential()
+    drwav_open_memory()
+    drwav_open_memory_ex()
+    drwav_open_memory_write()
+    drwav_open_memory_write_sequential()
+    drwav_close()
+
+These APIs will be removed completely in a future version. The rationale for this change is to remove confusion between the
+two different ways to initialize a drwav object.
+*/
+
+/*
+REVISION HISTORY
+================
+v0.12.16 - 2020-12-02
+  - Fix a bug when trying to read more bytes than can fit in a size_t.
+
+v0.12.15 - 2020-11-21
+  - Fix compilation with OpenWatcom.
+
+v0.12.14 - 2020-11-13
+  - Minor code clean up.
+
+v0.12.13 - 2020-11-01
+  - Improve compiler support for older versions of GCC.
+
+v0.12.12 - 2020-09-28
+  - Add support for RF64.
+  - Fix a bug in writing mode where the size of the RIFF chunk incorrectly includes the header section.
+
+v0.12.11 - 2020-09-08
+  - Fix a compilation error on older compilers.
+
+v0.12.10 - 2020-08-24
+  - Fix a bug when seeking with ADPCM formats.
+
+v0.12.9 - 2020-08-02
+  - Simplify sized types.
+
+v0.12.8 - 2020-07-25
+  - Fix a compilation warning.
+
+v0.12.7 - 2020-07-15
+  - Fix some bugs on big-endian architectures.
+  - Fix an error in s24 to f32 conversion.
+
+v0.12.6 - 2020-06-23
+  - Change drwav_read_*() to allow NULL to be passed in as the output buffer which is equivalent to a forward seek.
+  - Fix a buffer overflow when trying to decode invalid IMA-ADPCM files.
+  - Add include guard for the implementation section.
+
+v0.12.5 - 2020-05-27
+  - Minor documentation fix.
+
+v0.12.4 - 2020-05-16
+  - Replace assert() with DRWAV_ASSERT().
+  - Add compile-time and run-time version querying.
+    - DRWAV_VERSION_MINOR
+    - DRWAV_VERSION_MAJOR
+    - DRWAV_VERSION_REVISION
+    - DRWAV_VERSION_STRING
+    - drwav_version()
+    - drwav_version_string()
+
+v0.12.3 - 2020-04-30
+  - Fix compilation errors with VC6.
+
+v0.12.2 - 2020-04-21
+  - Fix a bug where drwav_init_file() does not close the file handle after attempting to load an erroneous file.
+
+v0.12.1 - 2020-04-13
+  - Fix some pedantic warnings.
+
+v0.12.0 - 2020-04-04
+  - API CHANGE: Add container and format parameters to the chunk callback.
+  - Minor documentation updates.
+
+v0.11.5 - 2020-03-07
+  - Fix compilation error with Visual Studio .NET 2003.
+
+v0.11.4 - 2020-01-29
+  - Fix some static analysis warnings.
+  - Fix a bug when reading f32 samples from an A-law encoded stream.
+
+v0.11.3 - 2020-01-12
+  - Minor changes to some f32 format conversion routines.
+  - Minor bug fix for ADPCM conversion when end of file is reached.
+
+v0.11.2 - 2019-12-02
+  - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
+  - Fix an integer overflow bug.
+  - Fix a null pointer dereference bug.
+  - Add limits to sample rate, channels and bits per sample to tighten up some validation.
+
+v0.11.1 - 2019-10-07
+  - Internal code clean up.
+
+v0.11.0 - 2019-10-06
+  - API CHANGE: Add support for user-defined memory allocation routines. This system allows the program to specify its own memory allocation
+    routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
+    - drwav_init()
+    - drwav_init_ex()
+    - drwav_init_file()
+    - drwav_init_file_ex()
+    - drwav_init_file_w()
+    - drwav_init_file_w_ex()
+    - drwav_init_memory()
+    - drwav_init_memory_ex()
+    - drwav_init_write()
+    - drwav_init_write_sequential()
+    - drwav_init_write_sequential_pcm_frames()
+    - drwav_init_file_write()
+    - drwav_init_file_write_sequential()
+    - drwav_init_file_write_sequential_pcm_frames()
+    - drwav_init_file_write_w()
+    - drwav_init_file_write_sequential_w()
+    - drwav_init_file_write_sequential_pcm_frames_w()
+    - drwav_init_memory_write()
+    - drwav_init_memory_write_sequential()
+    - drwav_init_memory_write_sequential_pcm_frames()
+    - drwav_open_and_read_pcm_frames_s16()
+    - drwav_open_and_read_pcm_frames_f32()
+    - drwav_open_and_read_pcm_frames_s32()
+    - drwav_open_file_and_read_pcm_frames_s16()
+    - drwav_open_file_and_read_pcm_frames_f32()
+    - drwav_open_file_and_read_pcm_frames_s32()
+    - drwav_open_file_and_read_pcm_frames_s16_w()
+    - drwav_open_file_and_read_pcm_frames_f32_w()
+    - drwav_open_file_and_read_pcm_frames_s32_w()
+    - drwav_open_memory_and_read_pcm_frames_s16()
+    - drwav_open_memory_and_read_pcm_frames_f32()
+    - drwav_open_memory_and_read_pcm_frames_s32()
+    Set this extra parameter to NULL to use defaults, which is the same as the previous behaviour. Setting this to NULL will use
+    DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE.
+  - Add support for reading and writing PCM frames in an explicit endianness. New APIs:
+    - drwav_read_pcm_frames_le()
+    - drwav_read_pcm_frames_be()
+    - drwav_read_pcm_frames_s16le()
+    - drwav_read_pcm_frames_s16be()
+    - drwav_read_pcm_frames_f32le()
+    - drwav_read_pcm_frames_f32be()
+    - drwav_read_pcm_frames_s32le()
+    - drwav_read_pcm_frames_s32be()
+    - drwav_write_pcm_frames_le()
+    - drwav_write_pcm_frames_be()
+  - Remove deprecated APIs.
+  - API CHANGE: The following APIs now return native-endian data. Previously they returned little-endian data.
+ - drwav_read_pcm_frames() + - drwav_read_pcm_frames_s16() + - drwav_read_pcm_frames_s32() + - drwav_read_pcm_frames_f32() + - drwav_open_and_read_pcm_frames_s16() + - drwav_open_and_read_pcm_frames_s32() + - drwav_open_and_read_pcm_frames_f32() + - drwav_open_file_and_read_pcm_frames_s16() + - drwav_open_file_and_read_pcm_frames_s32() + - drwav_open_file_and_read_pcm_frames_f32() + - drwav_open_file_and_read_pcm_frames_s16_w() + - drwav_open_file_and_read_pcm_frames_s32_w() + - drwav_open_file_and_read_pcm_frames_f32_w() + - drwav_open_memory_and_read_pcm_frames_s16() + - drwav_open_memory_and_read_pcm_frames_s32() + - drwav_open_memory_and_read_pcm_frames_f32() + +v0.10.1 - 2019-08-31 + - Correctly handle partial trailing ADPCM blocks. + +v0.10.0 - 2019-08-04 + - Remove deprecated APIs. + - Add wchar_t variants for file loading APIs: + drwav_init_file_w() + drwav_init_file_ex_w() + drwav_init_file_write_w() + drwav_init_file_write_sequential_w() + - Add drwav_target_write_size_bytes() which calculates the total size in bytes of a WAV file given a format and sample count. + - Add APIs for specifying the PCM frame count instead of the sample count when opening in sequential write mode: + drwav_init_write_sequential_pcm_frames() + drwav_init_file_write_sequential_pcm_frames() + drwav_init_file_write_sequential_pcm_frames_w() + drwav_init_memory_write_sequential_pcm_frames() + - Deprecate drwav_open*() and drwav_close(): + drwav_open() + drwav_open_ex() + drwav_open_write() + drwav_open_write_sequential() + drwav_open_file() + drwav_open_file_ex() + drwav_open_file_write() + drwav_open_file_write_sequential() + drwav_open_memory() + drwav_open_memory_ex() + drwav_open_memory_write() + drwav_open_memory_write_sequential() + drwav_close() + - Minor documentation updates. + +v0.9.2 - 2019-05-21 + - Fix warnings. + +v0.9.1 - 2019-05-05 + - Add support for C89. + - Change license to choice of public domain or MIT-0. + +v0.9.0 - 2018-12-16 + - API CHANGE: Add new reading APIs for reading by PCM frames instead of samples. Old APIs have been deprecated and + will be removed in v0.10.0. Deprecated APIs and their replacements: + drwav_read() -> drwav_read_pcm_frames() + drwav_read_s16() -> drwav_read_pcm_frames_s16() + drwav_read_f32() -> drwav_read_pcm_frames_f32() + drwav_read_s32() -> drwav_read_pcm_frames_s32() + drwav_seek_to_sample() -> drwav_seek_to_pcm_frame() + drwav_write() -> drwav_write_pcm_frames() + drwav_open_and_read_s16() -> drwav_open_and_read_pcm_frames_s16() + drwav_open_and_read_f32() -> drwav_open_and_read_pcm_frames_f32() + drwav_open_and_read_s32() -> drwav_open_and_read_pcm_frames_s32() + drwav_open_file_and_read_s16() -> drwav_open_file_and_read_pcm_frames_s16() + drwav_open_file_and_read_f32() -> drwav_open_file_and_read_pcm_frames_f32() + drwav_open_file_and_read_s32() -> drwav_open_file_and_read_pcm_frames_s32() + drwav_open_memory_and_read_s16() -> drwav_open_memory_and_read_pcm_frames_s16() + drwav_open_memory_and_read_f32() -> drwav_open_memory_and_read_pcm_frames_f32() + drwav_open_memory_and_read_s32() -> drwav_open_memory_and_read_pcm_frames_s32() + drwav::totalSampleCount -> drwav::totalPCMFrameCount + - API CHANGE: Rename drwav_open_and_read_file_*() to drwav_open_file_and_read_*(). + - API CHANGE: Rename drwav_open_and_read_memory_*() to drwav_open_memory_and_read_*(). + - Add built-in support for smpl chunks. + - Add support for firing a callback for each chunk in the file at initialization time. + - This is enabled through the drwav_init_ex(), etc. 
family of APIs.
+  - Handle invalid FMT chunks more robustly.
+
+v0.8.5 - 2018-09-11
+  - Const correctness.
+  - Fix a potential stack overflow.
+
+v0.8.4 - 2018-08-07
+  - Improve 64-bit detection.
+
+v0.8.3 - 2018-08-05
+  - Fix C++ build on older versions of GCC.
+
+v0.8.2 - 2018-08-02
+  - Fix some big-endian bugs.
+
+v0.8.1 - 2018-06-29
+  - Add support for sequential writing APIs.
+  - Disable seeking in write mode.
+  - Fix bugs with Wave64.
+  - Fix typos.
+
+v0.8 - 2018-04-27
+  - Bug fix.
+  - Start using major.minor.revision versioning.
+
+v0.7f - 2018-02-05
+  - Restrict ADPCM formats to a maximum of 2 channels.
+
+v0.7e - 2018-02-02
+  - Fix a crash.
+
+v0.7d - 2018-02-01
+  - Fix a crash.
+
+v0.7c - 2018-02-01
+  - Set drwav.bytesPerSample to 0 for all compressed formats.
+  - Fix a crash when reading 16-bit floating point WAV files. In this case dr_wav will output silence for
+    all format conversion reading APIs (*_s16, *_s32, *_f32 APIs).
+  - Fix some divide-by-zero errors.
+
+v0.7b - 2018-01-22
+  - Fix errors with seeking of compressed formats.
+  - Fix a compilation error when DR_WAV_NO_CONVERSION_API is defined.
+
+v0.7a - 2017-11-17
+  - Fix some GCC warnings.
+
+v0.7 - 2017-11-04
+  - Add writing APIs.
+
+v0.6 - 2017-08-16
+  - API CHANGE: Rename dr_* types to drwav_*.
+  - Add support for custom implementations of malloc(), realloc(), etc.
+  - Add support for Microsoft ADPCM.
+  - Add support for IMA ADPCM (DVI, format code 0x11).
+  - Optimizations to drwav_read_s16().
+  - Bug fixes.
+
+v0.5g - 2017-07-16
+  - Change underlying type for booleans to unsigned.
+
+v0.5f - 2017-04-04
+  - Fix a minor bug with drwav_open_and_read_s16() and family.
+
+v0.5e - 2016-12-29
+  - Added support for reading samples as signed 16-bit integers. Use the _s16() family of APIs for this.
+  - Minor fixes to documentation.
+
+v0.5d - 2016-12-28
+  - Use drwav_int* and drwav_uint* sized types to improve compiler support.
+
+v0.5c - 2016-11-11
+  - Properly handle JUNK chunks that come before the FMT chunk.
+
+v0.5b - 2016-10-23
+  - A minor change to drwav_bool8 and drwav_bool32 types.
+
+v0.5a - 2016-10-11
+  - Fixed a bug with drwav_open_and_read() and family due to incorrect argument ordering.
+  - Improve A-law and mu-law efficiency.
+
+v0.5 - 2016-09-29
+  - API CHANGE. Swap the order of "channels" and "sampleRate" parameters in drwav_open_and_read*(). Rationale for this is to
+    keep it consistent with dr_audio and dr_flac.
+
+v0.4b - 2016-09-18
+  - Fixed a typo in documentation.
+
+v0.4a - 2016-09-18
+  - Fixed a typo.
+  - Change date format to ISO 8601 (YYYY-MM-DD).
+
+v0.4 - 2016-07-13
+  - API CHANGE. Make onSeek consistent with dr_flac.
+  - API CHANGE. Rename drwav_seek() to drwav_seek_to_sample() for clarity and consistency with dr_flac.
+  - Added support for Sony Wave64.
+
+v0.3a - 2016-05-28
+  - API CHANGE. Return drwav_bool32 instead of int in onSeek callback.
+  - Fixed a memory leak.
+
+v0.3 - 2016-05-22
+  - Lots of API changes for consistency.
+
+v0.2a - 2016-05-16
+  - Fixed Linux/GCC build.
+
+v0.2 - 2016-05-11
+  - Added support for reading data as signed 32-bit PCM for consistency with dr_flac.
+
+v0.1a - 2016-05-07
+  - Fixed a bug in drwav_open_file() where the file handle would not be closed if the loader failed to initialize.
+
+v0.1 - 2016-05-04
+  - Initial versioned release.
+*/
+
+/*
+This software is available as a choice of the following licenses. Choose
+whichever you prefer.
+
+===============================================================================
+ALTERNATIVE 1 - Public Domain (www.unlicense.org)
+===============================================================================
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+
+===============================================================================
+ALTERNATIVE 2 - MIT No Attribution
+===============================================================================
+Copyright 2020 David Reid
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/ diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..af9cb4ef96c1c94c3b1a74776e78b64c0b5f63c5 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt @@ -0,0 +1,36 @@ +# +# gpt-2 + +set(TEST_TARGET gpt-2) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# gpt-2-quantize + +set(TEST_TARGET gpt-2-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# gpt-2-batched + +set(TEST_TARGET gpt-2-batched) +add_executable(${TEST_TARGET} main-batched.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + + +# +# For GPU offloading + +if (GGML_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) +endif() + +if (GGML_CLBLAST) + add_compile_definitions(GGML_USE_CLBLAST) +endif() + +if (GGML_METAL) + add_compile_definitions(GGML_USE_METAL) +endif() diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/README.md b/stable-diffusion.cpp/ggml/examples/gpt-2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..45c932c903bcfcf3e5955f8d0a127fae0beed9ee --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/README.md @@ -0,0 +1,225 @@ +# gpt-2 + +This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library. + +The program runs on the CPU - no video card is required. + +The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported. + +The example supports the following GPT-2 models: + +| Model | Description | Disk Size | +| --- | --- | --- | +| 117M | Small model | 240 MB | +| 345M | Medium model | 680 MB | +| 774M | Large model | 1.5 GB | +| 1558M | XL model | 3.0 GB | + +Sample performance on MacBook M1 Pro: + +| Model | Size | Time / Token | +| --- | --- | --- | +| GPT-2 | 117M | 5 ms | +| GPT-2 | 345M | 12 ms | +| GPT-2 | 774M | 23 ms | +| GPT-2 | 1558M | 42 ms | + +*TODO: add tables for Cerebras-GPT models* + +Sample output: + +``` +$ ./bin/gpt-2 -h +usage: ./bin/gpt-2 [options] + +options: + -h, --help show this help message and exit + -s SEED, --seed SEED RNG seed (default: -1) + -t N, --threads N number of threads to use during computation (default: 8) + -p PROMPT, --prompt PROMPT + prompt to start generation with (default: random) + -n N, --n_predict N number of tokens to predict (default: 200) + --top_k N top-k sampling (default: 40) + --top_p N top-p sampling (default: 0.9) + --temp N temperature (default: 1.0) + -b N, --batch_size N batch size for prompt processing (default: 8) + -m FNAME, --model FNAME + model path (default: models/gpt-2-117M/ggml-model.bin) + +$ ./bin/gpt-2 +gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' +gpt2_model_load: n_vocab = 50257 +gpt2_model_load: n_ctx = 1024 +gpt2_model_load: n_embd = 768 +gpt2_model_load: n_head = 12 +gpt2_model_load: n_layer = 12 +gpt2_model_load: f16 = 1 +gpt2_model_load: ggml ctx size = 311.12 MB +gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 +gpt2_model_load: model size = 239.08 MB +main: number of tokens in prompt = 1 + +So this is going to be the end of the line for us. + +If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan. 
+
+Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.
+
+We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>
+
+main: mem per token = 2048612 bytes
+main:     load time =   106.32 ms
+main:   sample time =     7.10 ms
+main:  predict time =   506.40 ms / 5.06 ms per token
+main:    total time =   629.84 ms
+```
+
+## Downloading and converting the original models (GPT-2)
+
+You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
+in TensorFlow format, so in order to use them with ggml, you need to convert them to the appropriate format. This is done
+via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) Python script.
+
+Here is the entire process for the GPT-2 117M model (download from official site + conversion):
+
+```
+cd ggml/build
+../examples/gpt-2/download-model.sh 117M
+
+Downloading model 117M ...
+models/gpt-2-117M/checkpoint                      100%[=============================>]      77  --.-KB/s  in 0s
+models/gpt-2-117M/encoder.json                    100%[=============================>]   1018K  1.20MB/s  in 0.8s
+models/gpt-2-117M/hparams.json                    100%[=============================>]      90  --.-KB/s  in 0s
+models/gpt-2-117M/model.ckpt.data-00000-of-00001  100%[=============================>] 474.70M  1.21MB/s  in 8m 39s
+models/gpt-2-117M/model.ckpt.index                100%[=============================>]   5.09K  --.-KB/s  in 0s
+models/gpt-2-117M/model.ckpt.meta                 100%[=============================>] 460.11K   806KB/s  in 0.6s
+models/gpt-2-117M/vocab.bpe                       100%[=============================>] 445.62K   799KB/s  in 0.6s
+Done! Model '117M' saved in 'models/gpt-2-117M/'
+
+Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.
+
+  python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1
+
+```
+
+This conversion requires that you have Python and TensorFlow installed on your computer. If you want to avoid
+this, you can download the already converted ggml models as described below.
+
+## Downloading and converting the original models (Cerebras-GPT)
+
+Clone the respective repository from here: https://huggingface.co/cerebras
+
+Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:
+
+```
+cd ggml/build
+git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/
+python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/
+
+```
+
+## Downloading the ggml model directly (GPT-2)
+
+For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
+way, you can directly download a single binary file and start using it. No Python or TensorFlow is required.
+
+Here is how to get the 117M ggml model:
+
+```
+cd ggml/build
+../examples/gpt-2/download-ggml-model.sh 117M
+
+Downloading ggml model 117M ...
+models/gpt-2-117M/ggml-model.bin         100%[===============================>] 239.58M  8.52MB/s  in 28s
+Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
+You can now use it like this:
+
+  $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
+
+```
+
+At some point, I might decide to stop hosting these models. If that happens, simply revert to the manual process above.
+
+## Quantizing the models
+
+You can also try to quantize the `ggml` models via 4-bit integer quantization.
+Keep in mind that for smaller models, this will render them completely useless.
+You generally want to quantize larger models. + +``` +# quantize GPT-2 F16 to Q4_0 (faster but less precise) +./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2 +./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example" + +# quantize Cerebras F16 to Q4_1 (slower but more precise) +./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3 +./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example" + +``` + +## Batched generation example + +You can try the batched generation from a given prompt using the gpt-2-batched binary. + +Sample output: + +``` +$ gpt-2-batched -np 5 -m models/gpt-2-117M/ggml-model.bin -p "Hello my name is" -n 50 + +main: seed = 1697037431 +gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' +gpt2_model_load: n_vocab = 50257 +gpt2_model_load: n_ctx = 1024 +gpt2_model_load: n_embd = 768 +gpt2_model_load: n_head = 12 +gpt2_model_load: n_layer = 12 +gpt2_model_load: ftype = 1 +gpt2_model_load: qntvr = 0 +gpt2_model_load: ggml tensor size = 320 bytes +gpt2_model_load: backend buffer size = 312.72 MB +ggml_init_cublas: found 1 CUDA devices: + Device 0: NVIDIA GeForce GTX 1660, compute capability 7.5 +gpt2_model_load: using CPU backend +gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 +gpt2_model_load: model size = 239.08 MB +extract_tests_from_file : No test file found. +test_gpt_tokenizer : 0 tests failed out of 0 tests. +main: compute buffer size: 3.26 MB + + +main: generating 5 sequences ... +main: prompt: 'Hello my name is' +main: number of tokens in prompt = 4, first 8 tokens: 15496 616 1438 318 + + +sequence 0: + +Hello my name is John. You can call me any way you want, if you want, but for my very first date, I will be on the phone with you. We're both in our early 20s, but I feel like it's all + +sequence 1: + +Hello my name is Robert, and I want to say that we're proud to have your company here on the world's largest platform for sharing your stories with us. This is a huge opportunity for our community. We have hundreds of people on this team and + +sequence 2: + +Hello my name is Jack. I'm the one who created you. + +Jack is a boy with a big smile and a big heart. He is a handsome guy. He loves the outdoors and loves the people he meets. He wants to be a + +sequence 3: + +Hello my name is John. I am a Canadian citizen with a large number of family in Quebec and I am interested in studying. My aim is to take up a post in the Journal of the International Academy of Sciences of Canada which I am currently finishing. + +sequence 4: + +Hello my name is Dan. I am an entrepreneur. I am a great father. I am a great husband. I am a great husband. I am a great dad. And I am a great husband. + +I love my life. 
I love + + + +main: load time = 880.80 ms +main: sample time = 91.43 ms +main: predict time = 2518.29 ms +main: total time = 3544.32 ms +``` diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..6057f81ce34d54b116d4afee205aecd728ea9027 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py @@ -0,0 +1,183 @@ +# Convert Cerebras models to ggml format +# +# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/ +# + +import sys +import struct +import json +import torch +import numpy as np +import re + +from transformers import AutoModelForCausalLM + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model-f16.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +print(hparams) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # rename headers to keep compatibility + if name == "transformer.ln_f.weight": + name = "model/ln_f/g" + elif name == "transformer.ln_f.bias": + name = "model/ln_f/b" + elif name == "transformer.wte.weight": + name = "model/wte" + elif name == "transformer.wpe.weight": + name = 
"model/wpe" + elif name == "lm_head.weight": + name = "model/lm_head" + elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"transformer.h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"transformer.h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == "/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..9113141f61f018ac6592d1557255b002b5aea572 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py @@ -0,0 +1,159 @@ +# Convert a model checkpoint to a ggml compatible file +# +# Load the model using TensorFlow. +# Iterate over all variables and write them to a binary file. 
+# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import json +import struct +import numpy as np +import tensorflow as tf + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +# helper method to convert a numpy array to different float types +def convert_to_ftype(data, ftype): + # fp16 + if ftype == 1: + return data.astype(np.float16) + + assert False, "Invalid ftype: " + str(ftype) + +if len(sys.argv) < 3: + print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + +list_vars = tf.train.list_variables(dir_model) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["n_vocab"])) +fout.write(struct.pack("i", hparams["n_ctx"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", ftype)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name, shape in list_vars: + print("Processing variable: " + name + " with shape: ", shape) + + data = tf.train.load_variable(dir_model, name).squeeze() + n_dims = len(data.shape); + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == 
"/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + dshape = data.shape + + ftype_cur = 0 + if ftype != 0: + # match name: + # "model/wte" + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name == "model/wte" or name[-2:] == "/w": + print(" Converting to " + ftype_str[ftype]) + data = convert_to_ftype(data, ftype) + ftype_cur = ftype + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", dshape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..6a2b865411d7d3ae93fcf78712ec3a9e9c957fc6 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py @@ -0,0 +1,195 @@ +# Convert GPT-2 h5 transformer model to ggml format +# +# Load the model using GPT2Model. +# Iterate over all variables and write them to a binary file. +# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import struct +import json +import numpy as np +import re + +from transformers import GPT2Model + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: + encoder_added = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +#fout.write(struct.pack("i", hparams["rotary_dim"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder) + len(encoder_added))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for key in encoder_added: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose these matrices: + # "transformer.h.*.mlp.c_proj.weight + if name.endswith(".mlp.c_proj.weight"): + print(" Transposing") + data = data.transpose() + + # rename headers to keep compatibility + if name == "ln_f.weight": + name = "model/ln_f/g" + elif name == "ln_f.bias": + name = "model/ln_f/b" + elif name == "wte.weight": + name = "model/wte" + elif name == "wpe.weight": + name = "model/wpe" + elif re.match(r"h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + 
name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + str = name.encode('utf-8') + + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh b/stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh new file mode 100644 index 0000000000000000000000000000000000000000..3aae015b71afb6f0c99a9161c1a4fdd71ab0d469 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This script downloads GPT-2 model files that have already been converted to ggml format. +# This way you don't have to convert them yourself. +# +# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead. + +#src="https://ggml.ggerganov.com" +#pfx="ggml-model-gpt-2" + +src="https://huggingface.co/ggerganov/ggml" +pfx="resolve/main/ggml-model-gpt-2" + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download ggml model + +printf "Downloading ggml model $model ...\n" + +mkdir -p models/gpt-2-$model + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +elif [ -x "$(command -v curl)" ]; then + curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + +if [ $? -ne 0 ]; then + printf "Failed to download ggml model $model \n" + printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! 
Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n" +printf "You can now use it like this:\n\n" +printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n" +printf "\n" diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh b/stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0c62f4f74db8fdb71dd45209d3a48bc159898b0 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download model + +printf "Downloading model $model ...\n" + +mkdir -p models/gpt-2-$model + +for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do + wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file +done + +printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n" +printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n" +printf "\n" +printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n" +printf "\n" diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/main-batched.cpp b/stable-diffusion.cpp/ggml/examples/gpt-2/main-batched.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ba665da2da2a0c35651e61627aeb7dac5601790 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/main-batched.cpp @@ -0,0 +1,1224 @@ +#include "ggml/ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +typedef int32_t gpt2_pos; +typedef int32_t gpt2_seq_id; + +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt2_kv_cell { + gpt2_pos pos = -1; + gpt2_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const gpt2_seq_id & id) const { + 
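+        // a cell may belong to several sequences at once (e.g. a shared prompt), hence the set lookup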
return seq_id.find(id) != seq_id.end(); + } +}; + +struct gpt2_kv_cache { + // key + value memory + struct ggml_tensor * k; + struct ggml_tensor * v; + // + + uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + ggml_backend_buffer_t buffer; +}; + +struct gpt2_model { + gpt2_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + gpt2_kv_cache kv_cache; + + struct ggml_context * ctx; + + ggml_backend_t backend = NULL; + + ggml_backend_buffer_t buffer_w; + + std::map tensors; +}; + +// Input data for gpt2_decode +// A gpt2_batch object can contain input about one or many sequences +// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens +// +// - token : the token ids of the input (used when embd is NULL) +// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) +// - pos : the positions of the respective token in the sequence +// - seq_id : the sequence to which the respective token belongs +// - logits : if zero, the logits for the respective token will not be output +// +struct gpt2_batch { + int32_t n_tokens = -1; + + gpt_vocab::id * token = {}; + float * embd = {}; + gpt2_pos * pos = {}; + gpt2_seq_id * seq_id = {}; + int8_t * logits = {}; +}; + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx, int n_gpu_layers) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + 
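+            // each vocab entry is a length-prefixed byte string; fill both directions of the token <-> id mapping below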
fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t buffer_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + buffer_size += (6 + 12*n_layer)*128; // alignment overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); + } + + // create the ggml context + { + size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer; + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + fprintf(stderr, "%s: using CPU backend\n", __func__); + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + 
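+        // even the CPU fallback failed to initialize, so no usable backend remains; abort model loading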
fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__); + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // override the default training context with the user-provided + model.hparams.n_ctx = n_ctx; + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.kv_cache.k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.kv_cache.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + model.kv_cache.head = 0; + 
model.kv_cache.size = n_ctx; + + model.kv_cache.cells.resize(n_ctx); + + const size_t memory_size = ggml_nbytes(model.kv_cache.k) + ggml_nbytes(model.kv_cache.v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + + // create a backend buffer (can be in host or device memory) + model.kv_cache.buffer = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + + // allocate the tensors into the backend buffer + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.kv_cache.buffer); + + // this updates the pointers in the tensors to point to the correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend + ggml_allocr_alloc(alloc, model.kv_cache.k); + ggml_allocr_alloc(alloc, model.kv_cache.v); + + ggml_allocr_free(alloc); + } + } + + // load weights + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + + size_t total_size = 0; + + bool has_lm_head = false; + + std::vector read_buf; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + ggml_allocr_alloc(alloc, tensor); + + if (ggml_backend_is_cpu (model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + //ggml_allocr_alloc(alloc, model.lm_head); + //ggml_backend_tensor_copy(tensor, model.lm_head); 
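+                // alias the freshly loaded wte tensor directly instead of allocating a copy (see the commented-out lines above)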
+ model.lm_head = tensor; + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + ggml_allocr_free(alloc); + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const gpt2_batch & batch) { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + const auto & kv_cache = model.kv_cache; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(allocr) ? n_ctx : kv_cache.n; + const int32_t kv_head = ggml_allocr_is_measure(allocr) ? n_ctx - n_tokens : kv_cache.head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inpL; + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_allocr_alloc(allocr, inp_tokens); + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < n_tokens; ++i) { + int32_t v = batch.pos[i]; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } + } + + // wte + wpe + inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, inp_tokens), + ggml_get_rows(ctx0, model.wpe, position)); + } else { + GGML_ASSERT(batch.embd); + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + + ggml_allocr_alloc(allocr, inpL); + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(inpL, batch.embd, 0, n_tokens * n_embd * ggml_element_size(inpL)); + } + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_allocr_alloc(allocr, KQ_mask); + if (!ggml_allocr_is_measure(allocr)) { + std::vector data_buf(n_kv*n_tokens); + const float neg_inf_v = -INFINITY; + + for (int h = 0; h < 1; ++h) { + int h_offset = h*(n_kv*n_tokens); + for (int j = 0; j < n_tokens; ++j) { + const gpt2_pos pos = batch.pos[j]; + const gpt2_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) { + data_buf[h_offset + j*n_kv + i] = neg_inf_v; + } + } + } + } + + ggml_backend_tensor_set(KQ_mask, 
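+            // upload the whole mask in one call; the -INFINITY entries zero out masked positions after soft_max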
data_buf.data(), 0, data_buf.size() * sizeof(float)); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g), + model.layers[il].ln_1_b); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, n_tokens] - cur (in) + // [2304, n_tokens] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, n_tokens] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_attn_b); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (n_tokens >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3) + // [64, n_kv, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd), + n_embd/n_head, n_head, n_kv), + 0, 2, 1, 3); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd), + // n_embd/n_head, n_head, n_kv), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_kv, n_tokens, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_kv, n_tokens, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + + // KQ_masked = mask_past(KQ_scaled) + // [n_kv, n_tokens, 12] + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + + // KQ = soft_max(KQ_masked) + // [n_kv, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous() + // [n_kv, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd), + n_embd/n_head, n_head, n_kv), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head)); + + // KQV = transpose(V) * 
KQ_soft_max + // [64, n_tokens, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, n_tokens] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, n_tokens] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g), + model.layers[il].ln_2_b); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + inpL, + model.ln_f_g), + model.ln_f_b); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +static void gpt2_kv_cache_seq_cp( + struct gpt2_kv_cache & cache, + gpt2_seq_id seq_id_src, + gpt2_seq_id seq_id_dst, + gpt2_pos p0, + gpt2_pos p1) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].seq_id.insert(seq_id_dst); + } + } +} + +struct gpt2_batch gpt2_batch_init(int32_t n_tokens, int32_t embd) { + gpt2_batch batch; + + if (embd) { + batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch.token = (gpt_vocab::id *) malloc(sizeof(gpt_vocab::id) * n_tokens); + } + + batch.pos = (gpt2_pos *) malloc(sizeof(gpt2_pos) * n_tokens); + batch.seq_id = (gpt2_seq_id *) malloc(sizeof(gpt2_seq_id) * n_tokens); + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return batch; +} + +void gpt2_batch_free(struct gpt2_batch batch) { + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.seq_id) 
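+    // each array was allocated with malloc in gpt2_batch_init, so plain free() is the matching release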
free(batch.seq_id); + if (batch.logits) free(batch.logits); +} + +// Positive return values does not mean a fatal error, but rather a warning. +// 0 - success +// < 0 - error +int gpt2_decode( + struct gpt2_model & model, + struct ggml_allocr * allocr, + struct gpt2_batch batch, + int n_threads, + std::vector & logits) { + const int32_t n_tokens = batch.n_tokens; + const auto & hparams = model.hparams; + const int n_vocab = hparams.n_vocab; + + if (n_tokens == 0) { + printf("%s: n_tokens == 0", __func__); + return -1; + } + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); + + auto & cache = model.kv_cache; + + for (int i = 0; i < n_tokens; i++) { + cache.cells[cache.head + i].pos = batch.pos[i]; + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]); + } + + cache.n = cache.head + n_tokens; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + if (batch.logits) { + // return logits for all tokens + logits.resize(n_vocab*n_tokens); + for (int32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + ggml_backend_tensor_get(inpL, logits.data() + n_vocab*i, n_vocab*i*sizeof(float), sizeof(float)*n_vocab); + } + } else { + // return result just for the last token + logits.resize(n_vocab); + ggml_backend_tensor_get(inpL, logits.data(), (n_vocab*(n_tokens-1))*sizeof(float), sizeof(float)*n_vocab); + } + + // update the kv ring buffer + cache.head += n_tokens; + + // ensure kv cache head points to a valid index. 
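+    // note: no ring-buffer wraparound is implemented here; once head reaches size, decoding
+    // reports an error instead of overwriting old cells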
+ if (cache.head >= cache.size) { + printf("%s: cache.head >= cache.size\n", __func__); + return -2; + } + + return 0; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params.n_ctx, params.n_gpu_layers)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + // keep this buffer alive while evaluating the model + ggml_backend_buffer_t buf_compute; + + const int n_parallel = params.n_parallel; + const int n_batch_max = std::max(embd_inp.size(), (size_t)n_parallel); + + // create a gpt2_batch + // we use this object to submit token data for decoding + gpt2_batch batch = gpt2_batch_init(n_batch_max, 0); + + // prepare required memory and allocate the compute buffer + struct ggml_allocr * allocr = NULL; + { + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); + + batch.n_tokens = n_batch_max; + + // create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = ggml_allocr_new_from_buffer(buf_compute); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // evaluate the initial prompt + batch.n_tokens = embd_inp.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = embd_inp[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + // gpt2_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (gpt2_decode(model, allocr, batch, params.n_threads, logits) != 0) { + printf("%s: gpt2_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cache to all parallel sequences + // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them + for (int32_t i = 1; i < n_parallel; ++i) { + gpt2_kv_cache_seq_cp(model.kv_cache, 0, i, 0, batch.n_tokens); + } + + if (n_parallel > 1) { + printf("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + 
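+    // collect each parallel sequence's output in its own string; with n_parallel > 1 the results are printed at the end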
std::vector streams(n_parallel); + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_cur = batch.n_tokens; + int n_len = batch.n_tokens + params.n_predict; + int n_decoded = 0; + + const int n_vocab = model.hparams.n_vocab; + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + while (n_cur < n_len) { + batch.n_tokens = 0; + + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + auto * logits_i = logits.data() + i_batch[i]*n_vocab; + + gpt_vocab::id id = 0; + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits_i, top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // is it an end of stream? -> mark the stream as finished + if ((!params.ignore_eos && id == 50256) || n_cur == n_len - 1) { + i_batch[i] = -1; + printf("\n"); + if (n_parallel > 1) { + printf("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + } + + continue; + } + + auto& token = vocab.id_to_token[id]; + if (n_parallel == 1) { + printf("%s", token.c_str()); + fflush(stdout); + } + + streams[i] += token; + + // push this new token for next evaluation + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = n_cur; + batch.seq_id[batch.n_tokens] = i; + batch.logits[batch.n_tokens] = true; + + i_batch[i] = batch.n_tokens; + + batch.n_tokens += 1; + + n_decoded += 1; + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_cur += 1; + + { + const int64_t t_start_us = ggml_time_us(); + + // evaluate the current batch with the transformer model + int ret_code = gpt2_decode(model, allocr, batch, params.n_threads, logits); + if (ret_code != 0) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, ret_code); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + } + + if (n_parallel > 1) { + printf("\n"); + + for (int32_t i = 0; i < n_parallel; ++i) { + printf("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: n_decoded = %8d\n", __func__, n_decoded); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms\n", __func__, t_predict_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + gpt2_batch_free(batch); + ggml_free(model.ctx); + + ggml_backend_buffer_free(model.buffer_w); + ggml_backend_buffer_free(model.kv_cache.buffer); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/main.cpp b/stable-diffusion.cpp/ggml/examples/gpt-2/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..514f299a117767ce694e35b3db57d0c0fe9b68d2 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-2/main.cpp @@ -0,0 +1,1002 @@ +#include "ggml/ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "common.h" +#include "common-ggml.h" + +#include 
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+// default hparams (GPT-2 117M)
+struct gpt2_hparams {
+    int32_t n_vocab = 50257;
+    int32_t n_ctx   = 1024;
+    int32_t n_embd  = 768;
+    int32_t n_head  = 12;
+    int32_t n_layer = 12;
+    int32_t ftype   = 1;
+    float   eps     = 1e-5f;
+};
+
+struct gpt2_layer {
+    // normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // attention
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // mlp
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gpt2_model {
+    gpt2_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte;     // token embedding
+    struct ggml_tensor * wpe;     // position embedding
+    struct ggml_tensor * lm_head; // language model head
+
+    std::vector<gpt2_layer> layers;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct ggml_context * ctx;
+
+    ggml_backend_t backend = NULL;
+
+    ggml_backend_buffer_t buffer_w;
+    ggml_backend_buffer_t buffer_kv;
+
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+// load the model's weights from a file
+bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx, int n_gpu_layers) {
+    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
+
+    auto fin = std::ifstream(fname, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+            return false;
+        }
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        fin.read((char *) &n_vocab, sizeof(n_vocab));
+
+        if (n_vocab != model.hparams.n_vocab) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
+            return false;
+        }
+
+        std::string word;
+        std::vector<char> buf(128);
+
+        for
(int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t buffer_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + buffer_size += (6 + 12*n_layer)*128; // alignment overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); + } + + // create the ggml context + { + size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer; + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + fprintf(stderr, "%s: 
using CPU backend\n", __func__); + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__); + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // override the default training context with the user-provided + model.hparams.n_ctx = n_ctx; + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + 
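+        // note: for the default GPT-2 117M hparams (n_layer = 12, n_ctx = 1024,
+        // n_embd = 768), each of memory_k and memory_v holds 12*1024*768 f32
+        // values, so the KV memory below is 2*12*1024*768*4 bytes = 72 MB in total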
+        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+
+        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+
+        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+
+        // create a backend buffer (can be in host or device memory)
+        model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
+
+        // allocate the tensors into the backend buffer
+        {
+            ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
+
+            // this updates the pointers in the tensors to point to the correct location in the buffer
+            // this is necessary since the ggml_context is .no_alloc == true
+            // note that the buffer can actually be a device buffer, depending on the backend
+            ggml_allocr_alloc(alloc, model.memory_k);
+            ggml_allocr_alloc(alloc, model.memory_v);
+
+            ggml_allocr_free(alloc);
+        }
+    }
+
+    // load weights
+    {
+        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w);
+
+        size_t total_size = 0;
+
+        bool has_lm_head = false;
+
+        std::vector<char> read_buf;
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ttype;
+
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+            if (fin.eof()) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
+
+            std::string name(length, 0);
+            fin.read(&name[0], length);
+
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
+                return false;
+            }
+
+            auto tensor = model.tensors[name];
+            ggml_set_name(tensor, name.c_str());
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
+                return false;
+            }
+
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }
+
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
+                return false;
+            }
+
+            ggml_allocr_alloc(alloc, tensor);
+
+            if (ggml_backend_is_cpu  (model.backend)
+#ifdef GGML_USE_METAL
+                || ggml_backend_is_metal(model.backend)
+#endif
+                ) {
+                // for the CPU and Metal backend, we can read directly into the tensor
+                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+            } else {
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(ggml_nbytes(tensor));
+                fin.read(read_buf.data(), ggml_nbytes(tensor));
+                ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
+            }
+
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                //ggml_allocr_alloc(alloc, model.lm_head);
+                //ggml_backend_tensor_copy(tensor, model.lm_head);
+                model.lm_head =
tensor; + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + ggml_allocr_free(alloc); + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + int32_t v = n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g), + model.layers[il].ln_1_b); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_attn_b); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, 
(ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g), + model.layers[il].ln_2_b); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = 
ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + inpL, + model.ln_f_g), + model.ln_f_b); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params.n_ctx, params.n_gpu_layers)) { + 
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        t_load_us = ggml_time_us() - t_start_us;
+
+        test_gpt_tokenizer(vocab, params.token_test);
+    }
+
+    // keep this buffer alive while evaluating the model
+    ggml_backend_buffer_t buf_compute;
+
+    struct ggml_allocr * allocr = NULL;
+    // allocate the compute buffer
+    {
+        // alignment required by the backend
+        size_t align = ggml_backend_get_alignment(model.backend);
+        allocr = ggml_allocr_new_measure(align);
+
+        // create the worst case graph for memory usage estimation
+        int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
+        int n_past   = model.hparams.n_ctx - n_tokens;
+        struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
+
+        // compute the required memory
+        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
+
+        // recreate the allocator with the required memory
+        ggml_allocr_free(allocr);
+        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
+        allocr = ggml_allocr_new_from_buffer(buf_compute);
+
+        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
+    }
+
+    int n_past = 0;
+
+    int64_t t_sample_us  = 0;
+    int64_t t_predict_us = 0;
+
+    std::vector<float> logits;
+
+    // tokenize the prompt
+    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
+
+    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
+    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
+        printf("%d ", embd_inp[i]);
+    }
+    printf("\n\n");
+
+    // submit the input prompt token-by-token
+    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
+    std::vector<gpt_vocab::id> embd;
+
+    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+        // predict
+        if (embd.size() > 0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) {
+                printf("Failed to predict\n");
+                return 1;
+            }
+
+            t_predict_us += ggml_time_us() - t_start_us;
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (int32_t(embd.size()) >= params.n_batch) {
+                    break;
+                }
+            }
+            i += embd.size() - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str());
+        }
+        fflush(stdout);
+
+        // end of text token
+        if (!params.ignore_eos && embd.back() == 50256) {
+            break;
+        }
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s:    load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        printf("%s:  sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        printf("%s:   total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+    ggml_backend_buffer_free(model.buffer_w);
+    ggml_backend_buffer_free(model.buffer_kv);
+    ggml_backend_buffer_free(buf_compute);
+    ggml_backend_free(model.backend);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/gpt-2/quantize.cpp b/stable-diffusion.cpp/ggml/examples/gpt-2/quantize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d8d53a67b1b1881da73eb7aa67074c84d866595
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/gpt-2/quantize.cpp
@@ -0,0 +1,184 @@
+#include "ggml/ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <regex>
+
+// default hparams (GPT-2 117M)
+struct gpt2_hparams {
+    int32_t n_vocab = 50257;
+    int32_t n_ctx   = 1024;
+    int32_t n_embd  = 768;
+    int32_t n_head  = 12;
+    int32_t n_layer = 12;
+    int32_t ftype   = 1;
+};
+
+// quantize a model
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+    gpt_vocab vocab;
+
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    auto finp = std::ifstream(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        finp.read((char *) &magic, sizeof(magic));
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &magic, sizeof(magic));
+    }
+
+    gpt2_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
+
+        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
+        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
+        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
+        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        finp.read ((char *) &n_vocab, sizeof(n_vocab));
+        fout.write((char *)
&n_vocab, sizeof(n_vocab)); + + if (n_vocab != hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte", + "model/lm_head", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/gpt-j/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3675b7df264ac90f2fcbd88491a1cec8b1a6b6d7 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# gpt-j + +set(TEST_TARGET gpt-j) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# gpt-j-quantize + +set(TEST_TARGET gpt-j-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/README.md b/stable-diffusion.cpp/ggml/examples/gpt-j/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e5cc7959ef09bbb86d2dd83a250f70120c179723 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/README.md @@ -0,0 +1,246 @@ +# gpt-j + +Local GPT-J inference on your computer using C/C++ + +No video card required. You just need to have 16 GB of RAM. + +## Motivation + +The GPT-J 6B model is the open-source alternative to OpenAI's GPT-3. 
It's basically a neural network that allows you to +generate coherent, human-like text given a certain context (prompt). + +The GPT-J model is quite big - the compact version of the model uses 16-bit floating point representation of the weights +and is still 12 GB big. This means that in order to run inference on your computer, you would need to have a video card +with at least 12 GB of video RAM. Alternatively, you can try to run the python implementations on the CPU, but that +would probably not be very efficient as they are primarily optimized for running on a GPU (or at least this is my guess - +I don't have much experience with python). + +I wanted to try and run the model on my MacBook, so I decided to implement the model inference from scratch using my own +custom build tensor library. The tensor library (called [ggml](https://github.com/ggerganov/ggml), written in C) is in +early development stage, but it already allows me to run the GPT-J model. + +On my 32GB MacBook M1 Pro, I achieve an inference speed of about `125 ms/token` or about ~6 words per second (1 word +typically consists of 1 or 2 tokens). + +Here is a sample run with prompt `int main(int argc, char ** argv) {`: + +``` +$ time ./bin/gpt-j -p "int main(int argc, char ** argv) {" + +gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ... +gptj_model_load: n_vocab = 50400 +gptj_model_load: n_ctx = 2048 +gptj_model_load: n_embd = 4096 +gptj_model_load: n_head = 16 +gptj_model_load: n_layer = 28 +gptj_model_load: n_rot = 64 +gptj_model_load: f16 = 1 +gptj_model_load: ggml ctx size = 13334.86 MB +gptj_model_load: memory_size = 1792.00 MB, n_mem = 57344 +gptj_model_load: ................................... done +gptj_model_load: model size = 11542.79 MB / num tensors = 285 +main: number of tokens in prompt = 13 + +int main(int argc, char ** argv) { + (void)argc; + (void)argv; + + { + struct sockaddr_in addr; + int addrlen; + char * ip = "192.168.1.4"; + int i; + + if ( (addrlen = sizeof(addr)) == -1 ) + return -1; + + for (i = 0; i < 10; ++i) { + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(ip); + +main: mem per token = 16430420 bytes +main: load time = 6211.48 ms +main: sample time = 13.74 ms +main: predict time = 26420.34 ms / 124.62 ms per token +main: total time = 33035.37 ms + +real 0m33.171s +user 3m32.269s +sys 0m3.686s + +$ +``` + +It took ~6.2 seconds to load the model to memory. After that, it took ~26.4 seconds to generate 200 tokens of what +looks like to be the beginning of a networking program in C. Pretty cool! + +Here is another run, just for fun: + +``` +time ./bin/gpt-j -n 500 -t 8 -p "Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it? +" + +gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ... +gptj_model_load: n_vocab = 50400 +gptj_model_load: n_ctx = 2048 +gptj_model_load: n_embd = 4096 +gptj_model_load: n_head = 16 +gptj_model_load: n_layer = 28 +gptj_model_load: n_rot = 64 +gptj_model_load: f16 = 1 +gptj_model_load: ggml ctx size = 13334.86 MB +gptj_model_load: memory_size = 1792.00 MB, n_mem = 57344 +gptj_model_load: ................................... done +gptj_model_load: model size = 11542.79 MB / num tensors = 285 +main: number of tokens in prompt = 24 + +Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it? 
+ +I've inherited a team with some very strange and un-documented practices, one of them is that they use an old custom +application with a very slow tech stack written in Python that the team doesn't want to touch but also doesn't want to +throw away as it has some "legacy" code in it. + +The problem is, the tech stack is very very slow. + +They have a single web server on a VM that is slow. +The server is a little bit busy (not very busy though) and they have a lot of processes (30+ that are constantly being +spawned by the application) +They have an application that is single threaded and was written in Python and the team don't want to touch this, and +the application is very slow. + +My task as a new member of the team is to fix this. + +I'm a senior dev on the team (3 years on the project) and have been told that I will take the lead on this task. I know +next to nothing about Python. So here is what I have so far. + +What I have done is I've been trying to debug the processes with the "ps" command. This way I can see what is running +and where. From what I see, the application spawns 10 processes a minute and some of them are used for nothing. + +I have also started to look for the code. The application source is not in GitHub or any other repository, it is only on +our internal GitLab. + +What I've found so far: + +The application uses a custom SQLAlchemy implementation to interact with the data. I've looked at the source, it looks +like an object cache or something like that. But from what I've seen, the cache gets full every 20 minutes and then gets +cleared with a special command. + +Another strange thing is that the application creates a file for every entry in the database (even if the entry already +exists). I've looked at the file to see if it contains something, but it seems to be a JSON file with lots of records. + +The other strange thing is that I can only find the database tables in the GitLab repository and not the code. So I +can't really understand how the application is supposed to interact with the database. + +I also found a "log" directory, but the code is encrypted with AES. From what I've found, it is in + +main: mem per token = 16430420 bytes +main: load time = 3900.10 ms +main: sample time = 32.58 ms +main: predict time = 68049.91 ms / 130.11 ms per token +main: total time = 73020.05 ms + +real 1m13.156s +user 9m1.328s +sys. 0m7.103s +``` + +## Implementation details + +The high level implementation of the model is contained in the [main.cpp](main.cpp) file. The core computations are +performed by the [ggml](https://github.com/ggerganov/ggml/blob/master/include/ggml/ggml.h) library. + + +#### Matrix multiplication + +The most performance critical part of the implementation is of course the matrix multiplication routine. 99% of the time +is spent here, so it was important to optimize this as much as possible. + +On Arm64, I utilize the 128-bit NEON intrinsics for 16-bit floating point operations: + +https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/src/ggml.c#L187-L243 + +These instructions allow each core to operate simultaneously on 64 16-bit floats. I'm no expert in SIMD, but after quite +some trials this was the most efficient code for dot product of a row and column that I could come up with. Combined +with the parallel computation on 8 CPU threads, I believe I'm close to the maximum performance that one could possibly +get on the M1 CPU. Still, I'm curious to know if there is a more efficient way to implement this. 
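+
+For illustration, here is a simplified sketch of such an fp16 dot product with NEON intrinsics (only two
+accumulators; `n` is assumed to be a multiple of 16; the actual kernel in `ggml.c` uses more accumulators and
+handles remainders):
+
+```c
+// compile with e.g. -march=armv8.2-a+fp16
+#include <arm_neon.h>
+
+// dot product of two fp16 rows, accumulating in fp16 and reducing to fp32 at the end
+float dot_f16(const __fp16 * x, const __fp16 * y, int n) {
+    float16x8_t sum0 = vdupq_n_f16(0.0f);
+    float16x8_t sum1 = vdupq_n_f16(0.0f);
+
+    // process 16 half floats per iteration, using fused multiply-add
+    for (int i = 0; i < n; i += 16) {
+        sum0 = vfmaq_f16(sum0, vld1q_f16(x + i),     vld1q_f16(y + i));
+        sum1 = vfmaq_f16(sum1, vld1q_f16(x + i + 8), vld1q_f16(y + i + 8));
+    }
+
+    // widen the partial sums to fp32 and reduce horizontally
+    float32x4_t t = vaddq_f32(vcvt_f32_f16(vget_low_f16(sum0)), vcvt_f32_f16(vget_high_f16(sum0)));
+    t = vaddq_f32(t, vcvt_f32_f16(vget_low_f16(sum1)));
+    t = vaddq_f32(t, vcvt_f32_f16(vget_high_f16(sum1)));
+
+    return vaddvq_f32(t);
+}
+```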
+ + +#### Attempt to use the M1 GPU + +One interesting property of the GPT-J transformer architecture is that it allows you to perform part of the inference in +parallel - i.e. the Feed-forward network can be computed in parallel to the Self-attention layer: + +https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/examples/gpt-j/main.cpp#L507-L531 + +So I thought why not try and bring in the M1 GPU to compute half of the neural network in parallel to the CPU and +potentially gain some extra performance. Thanks to the M1's shared memory model, it was relatively easy to offload part +of the computation to the GPU using Apple's [Metal Performance +Shaders](https://developer.apple.com/documentation/metalperformanceshaders). The GPU shares the host memory, so there is +no need to copy the data back and forth as you would normally do with Cuda or OpenCL. The weight matrices are directly +available to be used by the GPU. + +However, to my surprise, using MPS together with the CPU did not lead to any performance improvement at all. My +conclusion was that the 8-thread NEON CPU computation is already saturating the memory bandwidth of the M1 and since +the CPU and the GPU on the MacBook are sharing that bandwidth, it does not help to offload the computation to the GPU. +Another observation was that the MPS GPU matrix multiplication using 16-bit floats had the same performance as the +8-thread NEON CPU implementation. Again, I explain this with a saturated memory channel. But of course, my explanation +could be totally wrong and somehow the implementation wasn't utilizing the resources correctly. + +In the end, I decided to not use MPS or the GPU all together. + +### Zero memory allocations + +Another property of my implementation is that it does not perform any memory allocations once the model is loaded into +memory. All required memory is allocated at the start of the program with a single `malloc` (technically 2 calls, but +that is not important). + +## Usage + +If you want to give this a try and you are on Linux or Mac OS, simply follow these instructions: + +```bash +# Clone the ggml library and build the gpt-j example +git clone https://github.com/ggerganov/ggml +cd ggml +mkdir build && cd build +cmake .. +make -j4 gpt-j + +# Download the ggml-compatible GPT-J 6B model (requires 12GB disk space) +../examples/gpt-j/download-ggml-model.sh 6B + +# Run the inference (requires 16GB of CPU RAM) +./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example" + +# Input prompt through pipe and run the inference. +echo "This is an example" > prompt.txt +cat prompt.txt | ./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin +``` + +To run the `gpt-j` tool, you need the 12GB `ggml-model.bin` file which contains the GPT-J model in +[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, the binary file +is downloaded from my repository on Hugging Face using the [download-ggml-model.sh](download-ggml-model.sh) script. +You can also, download the file manually from this link: + +https://huggingface.co/ggerganov/ggml/tree/main + +--- + +Alternatively, if you don't want to download the 12GB ggml model file, you can perform the conversion yourself using +python. + +First, you need to download the full GPT-J model from here: https://huggingface.co/EleutherAI/gpt-j-6B + +Note that the full model is quite big - about 72 GB. After you download it, you need to convert it to ggml format using +the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script. 
This will generate the `ggml-model.bin` file, which you can +then use with the `gpt-j` program. + + +## GPT-2 + +I also implemented a tool for CPU inference using the smaller GPT-2 models. They have worse quality compared to GPT-J, +but are much faster to execute. + +For example, the Small GPT-2 model is only 240 MB big and the inference speed on my MacBook is about 200 tokens/sec. + +For more details, checkout the GPT-2 example here: [gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2) diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/gpt-j/convert-h5-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7731720e8625c18a78fc20de4c9d11bc22a6f0 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/convert-h5-to-ggml.py @@ -0,0 +1,173 @@ +# Convert GPT-J-6B h5 transformer model to ggml format +# +# Load the model using GPTJForCausalLM. +# Iterate over all variables and write them to a binary file. +# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import struct +import json +import torch +import numpy as np + +from transformers import GPTJForCausalLM + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 3: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: + encoder_added = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + + +model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", hparams["rotary_dim"])) +fout.write(struct.pack("i", ftype)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder) + len(encoder_added))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for key in encoder_added: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0; + if ftype != 0: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + # for efficiency - transpose these matrices: + # (note - with latest ggml this is no longer more efficient, so disabling it) + # "transformer.h.*.mlp.fc_in.weight" + # "transformer.h.*.attn.out_proj.weight" + # "transformer.h.*.attn.q_proj.weight" + # "transformer.h.*.attn.k_proj.weight" + # "transformer.h.*.attn.v_proj.weight" + #if name.endswith(".mlp.fc_in.weight") or \ + # name.endswith(".attn.out_proj.weight") or \ + # name.endswith(".attn.q_proj.weight") or \ + # name.endswith(".attn.k_proj.weight") or \ + # name.endswith(".attn.v_proj.weight"): + # 
print(" Transposing") + # data = data.transpose() + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/download-ggml-model.sh b/stable-diffusion.cpp/ggml/examples/gpt-j/download-ggml-model.sh new file mode 100644 index 0000000000000000000000000000000000000000..a9e2aa517d837c306da5079e7a63ff8e46e71678 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/download-ggml-model.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This script downloads GPT-J model files that have already been converted to ggml format. +# This way you don't have to convert them yourself. +# +# If you want to download the original GPT-J model files, use the "download-model.sh" script instead. + +#src="https://ggml.ggerganov.com" +#pfx="ggml-model-gpt-j" + +src="https://huggingface.co/ggerganov/ggml" +pfx="resolve/main/ggml-model-gpt-j" + +ggml_path=$(dirname $(realpath $0)) + +# GPT-J models +models=( "6B" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download ggml model + +printf "Downloading ggml model $model ...\n" + +mkdir -p models/gpt-j-$model + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin +elif [ -x "$(command -v curl)" ]; then + curl -L --output models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + +if [ $? -ne 0 ]; then + printf "Failed to download ggml model $model \n" + printf "Please try again later or download the original GPT-J model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! Model '$model' saved in 'models/gpt-j-$model/ggml-model.bin'\n" +printf "You can now use it like this:\n\n" +printf " $ ./bin/gpt-j -m models/gpt-j-$model/ggml-model.bin -p \"This is an example\"\n" +printf "\n" diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/download-model.sh b/stable-diffusion.cpp/ggml/examples/gpt-j/download-model.sh new file mode 100644 index 0000000000000000000000000000000000000000..c773baf40859f63fb86f7add90df2df914c2acb1 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/download-model.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +printf "To obtain the GPT-J 6B model files, please visit: https://huggingface.co/EleutherAI/gpt-j-6B\n\n" + +printf "The model is very big. 
For example, the reposirory above is 72GB in size.\n" +printf "If you are sure that you want to clone it, simply run the following command:\n\n" + +printf " $ git clone https://huggingface.co/EleutherAI/gpt-j-6B models/gpt-j-6B\n\n" + +printf "Alternatively, use the 'download-ggml-model.sh' script to download a 12GB ggml version of the model.\n" +printf "This version is enough to run inference using the ggml library.\n\n" diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/main.cpp b/stable-diffusion.cpp/ggml/examples/gpt-j/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c2c4c1fef9067c66adec5b95d038f6c1c3c31729 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/main.cpp @@ -0,0 +1,755 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + + +// default hparams (GPT-J 6B) +struct gptj_hparams { + int32_t n_vocab = 50400; + int32_t n_ctx = 2048; + int32_t n_embd = 4096; + int32_t n_head = 16; + int32_t n_layer = 28; + int32_t n_rot = 64; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gptj_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + // attention + struct ggml_tensor * c_attn_q_proj_w; + struct ggml_tensor * c_attn_k_proj_w; + struct ggml_tensor * c_attn_v_proj_w; + + struct ggml_tensor * c_attn_proj_w; + + // ff + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gptj_model { + gptj_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + + struct ggml_tensor * lmh_g; // language model head + struct ggml_tensor * lmh_b; // language model bias + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// load the model's weights from a file +bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + 
printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_rot = %d\n", __func__, hparams.n_rot); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + + ctx_size += (5 + 10*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g 
= ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["transformer.wte.weight"] = model.wte; + + model.tensors["transformer.ln_f.weight"] = model.ln_f_g; + model.tensors["transformer.ln_f.bias"] = model.ln_f_b; + + model.tensors["lm_head.weight"] = model.lmh_g; + model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w; + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w; + model.tensors["transformer.h." 
+ std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.c_str(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +// The GPT-J model requires about 16MB of memory per input token. 
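+// (rough back-of-envelope on that figure: the f16 key/value cache alone costs
+// 2*n_layer*n_embd*sizeof(ggml_fp16_t) = 2*28*4096*2 = 458752 bytes, about 0.44 MB per token;
+// the remainder is intermediate tensors created in the eval context during the forward pass,
+// which is why mem_per_token is measured empirically in main() as ggml_used_mem(ctx0)/N on a
+// small warm-up evaluation rather than computed analytically)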
+// +bool gptj_eval( + const gptj_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + const int n_rot = hparams.n_rot; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + // wte + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + struct ggml_tensor * inpSA = cur; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); + + // store key and value to memory + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = 
ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + } + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + { + // note here we pass inpSA instead of cur + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + inpSA); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // self-attention + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + inpL = ggml_add(ctx0, + ggml_repeat(ctx0, model.lmh_b, inpL), + inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-j-6B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); 
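+ // the Mersenne Twister seeded above is the program's only source of randomness: it picks
+ // the random prompt below (when none is given) and drives top-k/top-p sampling, so running
+ // twice with the same seed produces the same completion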
+ if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gptj_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gptj_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector<float> logits; + + // tokenize the prompt + std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + printf("\n"); + + std::vector<gpt_vocab::id> embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) > params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/gpt-j/quantize.cpp b/stable-diffusion.cpp/ggml/examples/gpt-j/quantize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..437053b7d86b6bfe30d34a4810235ed837f45e43 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-j/quantize.cpp @@ -0,0 +1,182 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <map> +#include <string> +#include <vector> +#include <regex> + +// default hparams (GPT-J 6B) +struct gptj_hparams { + int32_t n_vocab = 50400; + int32_t n_ctx = 2048; + 
int32_t n_embd = 4096; + int32_t n_head = 16; + int32_t n_layer = 28; + int32_t n_rot = 64; + int32_t ftype = 1; +}; + +// quantize a model +bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { + gpt_vocab vocab; + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gptj_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); + fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + int32_t n_vocab = 0; + finp.read ((char *) &n_vocab, sizeof(n_vocab)); + fout.write((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + ".*weight", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } 
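+ // only tensors whose names match the to_quant regexes (here every ".*weight" matrix) are
+ // re-encoded in the requested quantized type; biases and layer-norm parameters are copied
+ // through unchanged, and the common helper additionally skips non-2D tensors regardless of
+ // their name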
+ + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/gpt-neox/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/gpt-neox/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..21a319b33c215de48c872335ec79bd7a09130889 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-neox/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# gpt-neox + +set(TEST_TARGET gpt-neox) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# gpt-neox-quantize + +set(TEST_TARGET gpt-neox-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/stable-diffusion.cpp/ggml/examples/gpt-neox/README.md b/stable-diffusion.cpp/ggml/examples/gpt-neox/README.md new file mode 100644 index 0000000000000000000000000000000000000000..64c6d7c623f5d9f0708c655163f26a04647eca97 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-neox/README.md @@ -0,0 +1,110 @@ +# GPT-NeoX + +Transformer architecture: GPT-NeoX + +Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha + +## Usage + +```bash +# get the repo and build it +git clone https://github.com/ggerganov/ggml +cd ggml +mkdir build && cd build +cmake .. +make -j + +# get the StableLM 3B Alpha model +git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b + +# install Python dependencies +python3 -m pip install -r ../requirements.txt + +# convert model to FP16 +python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1 + +# run inference using FP16 precision +make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64 + +main: seed = 1681940611 +gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ... 
+gpt_neox_model_load: n_vocab = 50688 +gpt_neox_model_load: n_ctx = 4096 +gpt_neox_model_load: n_embd = 4096 +gpt_neox_model_load: n_head = 32 +gpt_neox_model_load: n_layer = 16 +gpt_neox_model_load: n_rot = 32 +gpt_neox_model_load: ftype = 1 +gpt_neox_model_load: ggml ctx size = 10011.10 MB +gpt_neox_model_load: memory_size = 2048.00 MB, n_mem = 65536 +gpt_neox_model_load: ................................ done +gpt_neox_model_load: model size = 6939.28 MB / num tensors = 260 +main: number of tokens in prompt = 7 +main: token[0] = 42, I +main: token[1] = 2868, believe +main: token[2] = 253, the +main: token[3] = 4495, meaning +main: token[4] = 273, of +main: token[5] = 1495, life +main: token[6] = 310, is + +I believe the meaning of life is to grow, to find a way, to love, to find an appreciation for life, and to live it with all of its beauty. + +For I am the child of God. I am the offspring of God's love. I am the offspring of the light of the world. I am the offspring of the + +main: mem per token = 12186760 bytes +main: load time = 2118.55 ms +main: sample time = 9.59 ms +main: predict time = 4474.07 ms / 63.92 ms per token +main: total time = 6911.26 ms +``` + +## 5-bit integer quantization mode + +```bash +# quantize the model to 5-bits using Q5_0 quantization +./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0 + +# run the quantized model +./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64 + +main: seed = 1682021489 +gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ... +gpt_neox_model_load: n_vocab = 50688 +gpt_neox_model_load: n_ctx = 4096 +gpt_neox_model_load: n_embd = 4096 +gpt_neox_model_load: n_head = 32 +gpt_neox_model_load: n_layer = 16 +gpt_neox_model_load: n_rot = 32 +gpt_neox_model_load: ftype = 6 +gpt_neox_model_load: ggml ctx size = 5676.10 MB +gpt_neox_model_load: memory_size = 1024.00 MB, n_mem = 65536 +gpt_neox_model_load: ........................ done +gpt_neox_model_load: model size = 2604.28 MB / num tensors = 196 +main: number of tokens in prompt = 7 +main: token[0] = 42, I +main: token[1] = 2868, believe +main: token[2] = 253, the +main: token[3] = 4495, meaning +main: token[4] = 273, of +main: token[5] = 1495, life +main: token[6] = 310, is + +I believe the meaning of life is to love and be loved. The last three verses were enough to tie us all together. If you love someone you love them all. There are some things in this world that are just not equal in Heaven. - Be here in this moment. + +This world is not what is outside of us. 
It is what + +main: mem per token = 12958024 bytes +main: load time = 850.51 ms +main: sample time = 9.95 ms +main: predict time = 3103.81 ms / 44.34 ms per token +main: total time = 4177.68 ms + +``` + +## Notes + +- No guarantees for correctness +- The tokenizer is currently hacked - probably works only for English +- Non-parallel residual is not supported +- Contributions and improvements are welcome diff --git a/stable-diffusion.cpp/ggml/examples/gpt-neox/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/gpt-neox/convert-h5-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..f11a4cbc44def72050be6c5f651cca090e1e1420 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-neox/convert-h5-to-ggml.py @@ -0,0 +1,107 @@ +import sys +import struct +import json +import numpy as np + +from transformers import AutoModelForCausalLM, AutoTokenizer + +if len(sys.argv) < 3: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + + +tokenizer = AutoTokenizer.from_pretrained(dir_model) +model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) + +list_vars = model.state_dict() +for name in list_vars.keys(): + print(name, list_vars[name].shape, list_vars[name].dtype) + +fout = open(fname_out, "wb") + +print(hparams) + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["max_position_embeddings"])) +fout.write(struct.pack("i", hparams["hidden_size"])) +fout.write(struct.pack("i", hparams["num_attention_heads"])) +fout.write(struct.pack("i", hparams["num_hidden_layers"])) +fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))) +fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)) +fout.write(struct.pack("i", ftype)) + +# TODO: temporary hack to not deal with implementing the tokenizer +for i in range(hparams["vocab_size"]): + text = tokenizer.decode([i]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith(".attention.masked_bias") or \ + name.endswith(".attention.bias") or \ + name.endswith(".attention.rotary_emb.inv_freq"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if ftype != 0: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data 
= data.astype(np.float32) + ftype_cur = 0 + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/gpt-neox/main.cpp b/stable-diffusion.cpp/ggml/examples/gpt-neox/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..43e16ce3e9ed8087c61615ea6ba23e7924a063a3 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-neox/main.cpp @@ -0,0 +1,821 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams (StableLM 3B) +struct gpt_neox_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 4096; + int32_t n_embd = 4096; + int32_t n_head = 32; + int32_t n_layer = 16; + int32_t n_rot = 32; // rotary_pct * (n_embd / n_head) + int32_t par_res = 1; // 1 = true, 0 = false + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt_neox_layer { + // pre normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // post normalization + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // ff + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt_neox_model { + gpt_neox_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + + struct ggml_tensor * lmh_g; // language model head + //struct ggml_tensor * lmh_b; // language model bias + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// load the model's weights from a file +bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + fin.read((char *) &hparams.par_res, sizeof(hparams.par_res)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", 
__func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_rot = %d\n", __func__, hparams.n_rot); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + const int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const size_t n_embd = hparams.n_embd; + const size_t n_layer = hparams.n_layer; + const size_t n_ctx = hparams.n_ctx; + const size_t n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 16*n_layer)*1024; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = 
hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["gpt_neox.embed_in.weight"] = model.wte; + + model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; + model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; + + model.tensors["embed_out.weight"] = model.lmh_g; + //model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w; + model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + fin.close(); + + return true; +} + + +// feed-forward network +ggml_tensor * gpt_neox_ff( + const gpt_neox_layer & layer, + ggml_context * ctx0, + ggml_tensor * inp, + float eps) { + ggml_tensor * cur = ggml_norm(ctx0, inp, eps); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, layer.ln_2_g, cur), + cur), + ggml_repeat(ctx0, layer.ln_2_b, cur)); + + cur = ggml_mul_mat(ctx0, + layer.c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, + layer.c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), + cur); + return cur; +} + +// evaluate the 
transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt_neox_eval( + const gpt_neox_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + const int n_rot = hparams.n_rot; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + // wte + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // self-attention + { + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // compute QKV + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); + + // using mode = 2 for GPT-NeoX mode + Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0); + Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0); + + // store key and value to memory + { + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, 
(ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); + } + } + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + if (hparams.par_res == 0) { + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + + cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } else { + struct ggml_tensor * inpFF = cur; + + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + // note here we pass inpL instead of cur + cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps); + + // layer input + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + //inpL = ggml_add(ctx0, + // ggml_repeat(ctx0, model.lmh_b, inpL), + // inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + 
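+ // ggml is define-then-run: every op above only recorded a node in ctx0, and expanding the
+ // graph from the final logits tensor pulls in all of its dependencies; nothing is actually
+ // evaluated until the compute call below walks the graph with n_threads workers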
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt_neox_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt_neox_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (size_t i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + } + printf("\n"); + + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) > params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if 
(embd.back() == 0) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/gpt-neox/quantize.cpp b/stable-diffusion.cpp/ggml/examples/gpt-neox/quantize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96208c1e89f6b810c5fa2836fbe4be7e66c061db --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/gpt-neox/quantize.cpp @@ -0,0 +1,178 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// default hparams (StableLM 3B) +struct gpt_neox_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 4096; + int32_t n_embd = 4096; + int32_t n_head = 32; + int32_t n_layer = 16; + int32_t n_rot = 32; // 0.25 * (n_embd / n_head) + int32_t par_res = 1; // 1 = true, 0 = false + int32_t ftype = 1; +}; + +// quantize a model +bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { + gpt_vocab vocab; + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gpt_neox_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) 
= %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); + fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + fout.write((char *) &hparams.par_res, sizeof(hparams.par_res)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + const int32_t n_vocab = hparams.n_vocab; + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + ".*weight", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/mnist/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d9b93edc36dd84997340df3c9f49cf83cbbe9bc --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/CMakeLists.txt @@ -0,0 +1,40 @@ +# +# mnist + +set(TEST_TARGET mnist) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) + +# +# mnist-cnn + +set(TEST_TARGET mnist-cnn) +add_executable(${TEST_TARGET} main-cnn.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) + +# +# mnist-cpu + +set(TEST_TARGET mnist-cpu) +add_executable(${TEST_TARGET} main-cpu.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) + +if (APPLE) + # + # mnist-mtl + + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + 
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(TEST_TARGET mnist-mtl) + add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) + target_link_libraries(${TEST_TARGET} PRIVATE + ggml + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() diff --git a/stable-diffusion.cpp/ggml/examples/mnist/README.md b/stable-diffusion.cpp/ggml/examples/mnist/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f6b66f74304fdb728eddce3b617f709f3566d298 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/README.md @@ -0,0 +1,128 @@ +# MNIST Examples for GGML + +These are simple examples of how to use GGML for inferencing. +The first example uses convolutional neural network (CNN), the second one uses fully connected neural network. + +## Building the examples + +```bash +git clone https://github.com/ggerganov/ggml +cd ggml +mkdir build && cd build +cmake .. +make -j4 mnist-cnn mnist +``` + +## MNIST with CNN + +This implementation achieves ~99% accuracy on the MNIST test set. + +### Training the model + +Use the `mnist-cnn.py` script to train the model and convert it to GGUF format: + +``` +$ python3 ../examples/mnist/mnist-cnn.py train mnist-cnn-model +... +Keras model saved to 'mnist-cnn-model' +``` + +Convert the model to GGUF format: + +``` +$ python3 ../examples/mnist/mnist-cnn.py convert mnist-cnn-model +... +Model converted and saved to 'mnist-cnn-model.gguf' +``` + +### Running the example + +```bash +$ ./bin/mnist-cnn mnist-cnn-model.gguf ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +main: loaded model in 5.17 ms +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ * * _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ * * * * * _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * * _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ * * * * * * _ _ * * * _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ * * * _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ +_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ +_ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ * * * * * * _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +ggml_graph_dump_dot: dot -Tpng mnist-cnn.dot -o mnist-cnn.dot.png && open mnist-cnn.dot.png +main: predicted digit is 8 +``` + +Computation graph: + +![mnist dot](https://user-images.githubusercontent.com/1991296/263763842-3b679b45-7ca1-4ee9-b19a-82e34396624f.png) + +## MNIST 
with a fully connected network
+
+A fully connected layer + ReLU, followed by a fully connected layer + softmax.
+
+### Training the Model
+
+A Google Colab notebook for training a simple two-layer network to recognize digits is linked below. You can
+use it to save a PyTorch model to be converted to ggml format.
+
+[Colab](https://colab.research.google.com/drive/12n_8VNJnolBnX5dVS0HNWubnOjyEaFSb?usp=sharing)
+
+GGML "format" is whatever you choose for efficient loading. In our case, we just save the hyperparameters used
+plus the model weights and biases. Run convert-h5-to-ggml.py to convert your PyTorch model. The output format is:
+
+- magic constant (int32)
+- repeated list of tensors, each written as:
+  - number of dimensions of tensor (int32)
+  - tensor dimensions (int32, repeated)
+  - values of tensor (float32)
+
+Run ```convert-h5-to-ggml.py mnist_model.state_dict``` where `mnist_model.state_dict` is the saved PyTorch model from the Google Colab. For
+a quick start, it is included in the mnist/models directory.
+
+```bash
+mkdir -p models/mnist
+python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
+```
+
+### Running the example
+
+```bash
+./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+```
+
+Computation graph:
+
+![mnist dot](https://user-images.githubusercontent.com/1991296/231882071-84e29d53-b226-4d73-bdc2-5bd6dcb7efd1.png)
+
+
+## Web demo
+
+The example can be compiled with Emscripten like this:
+
+```bash
+cd examples/mnist
+emcc -I../../include -I../../include/ggml -I../../examples ../../src/ggml.c main.cpp -o web/mnist.js -s EXPORTED_FUNCTIONS='["_wasm_eval","_wasm_random_digit","_malloc","_free"]' -s EXPORTED_RUNTIME_METHODS='["ccall"]' -s ALLOW_MEMORY_GROWTH=1 --preload-file models/mnist
+```
+
+Online demo: https://mnist.ggerganov.com
diff --git a/stable-diffusion.cpp/ggml/examples/mnist/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/mnist/convert-h5-to-ggml.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f75365409b61d88657100ffe4c019e28117a3c
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/mnist/convert-h5-to-ggml.py
@@ -0,0 +1,63 @@
+# Convert MNIST h5 model to ggml format
+#
+# Load the (state_dict) saved model using PyTorch
+# Iterate over all variables and write them to a binary file.
+#
+# For each variable, write the following:
+#   - number of dimensions (int32)
+#   - tensor dimensions, in reverse (ggml) order (int32[n_dims])
+#   - tensor data (float32[n_elements])
+#
+# At the start of the ggml file we write the magic constant
+
+import sys
+import struct
+
+import numpy as np
+import torch
+
+if len(sys.argv) != 2:
+    print("Usage: convert-h5-to-ggml.py model\n")
+    sys.exit(1)
+
+state_dict_file = sys.argv[1]
+fname_out = "models/mnist/ggml-model-f32.bin"
+
+state_dict = torch.load(state_dict_file, map_location=torch.device('cpu'))
+
+list_vars = state_dict
+print(list_vars)
+
+fout = open(fname_out, "wb")
+
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+    print("Processing variable: " + name + " with shape: ", data.shape)
+    n_dims = len(data.shape)
+
+    fout.write(struct.pack("i", n_dims))
+
+    data = data.astype(np.float32)
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main-cnn.cpp b/stable-diffusion.cpp/ggml/examples/mnist/main-cnn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7949e9a6069768036ce7c0b419b7465288898936
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/mnist/main-cnn.cpp
@@ -0,0 +1,169 @@
+#include "ggml/ggml.h"
+
+#include "common.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+struct mnist_model {
+    struct ggml_tensor * conv2d_1_kernel;
+    struct ggml_tensor * conv2d_1_bias;
+    struct ggml_tensor * conv2d_2_kernel;
+    struct ggml_tensor * conv2d_2_bias;
+    struct ggml_tensor * dense_weight;
+    struct ggml_tensor * dense_bias;
+    struct ggml_context * ctx;
+};
+
+bool mnist_model_load(const std::string & fname, mnist_model & model) {
+    struct gguf_init_params params = {
+        /*.no_alloc =*/ false,
+        /*.ctx      =*/ &model.ctx,
+    };
+    gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+    if (!ctx) {
+        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
+        return false;
+    }
+    model.conv2d_1_kernel = ggml_get_tensor(model.ctx, "kernel1");
+    model.conv2d_1_bias   = ggml_get_tensor(model.ctx, "bias1");
+    model.conv2d_2_kernel = ggml_get_tensor(model.ctx, "kernel2");
+    model.conv2d_2_bias   = ggml_get_tensor(model.ctx, "bias2");
+    model.dense_weight    = ggml_get_tensor(model.ctx, "dense_w");
+    model.dense_bias      = ggml_get_tensor(model.ctx, "dense_b");
+    return true;
+}
+
+int mnist_eval(
+        const mnist_model & model,
+        const int n_threads,
+        std::vector<float> digit,
+        const char * fname_cgraph
+    )
+{
+    static size_t buf_size = 100000 * sizeof(float) * 4;
+    static void * buf = malloc(buf_size);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = {};
+
+    struct ggml_tensor * input = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 28, 28, 1, 1);
+    memcpy(input->data, digit.data(), ggml_nbytes(input));
+    ggml_set_name(input, "input");
+    ggml_tensor * cur = ggml_conv_2d(ctx0,
model.conv2d_1_kernel, input, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.conv2d_1_bias); + cur = ggml_relu(ctx0, cur); + // Output shape after Conv2D: (26 26 32 1) + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0); + // Output shape after MaxPooling2D: (13 13 32 1) + cur = ggml_conv_2d(ctx0, model.conv2d_2_kernel, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.conv2d_2_bias); + cur = ggml_relu(ctx0, cur); + // Output shape after Conv2D: (11 11 64 1) + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0); + // Output shape after MaxPooling2D: (5 5 64 1) + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + // Output shape after permute: (64 5 5 1) + cur = ggml_reshape_2d(ctx0, cur, 1600, 1); + // Final Dense layer + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.dense_weight, cur), model.dense_bias); + ggml_tensor * probs = ggml_soft_max(ctx0, cur); + ggml_set_name(probs, "probs"); + + ggml_build_forward_expand(&gf, probs); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //ggml_graph_print(&gf); + ggml_graph_dump_dot(&gf, NULL, "mnist-cnn.dot"); + + if (fname_cgraph) { + // export the compute graph for later use + // see the "mnist-cpu" example + ggml_graph_export(&gf, fname_cgraph); + + fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); + } + + const float * probs_data = ggml_get_data_f32(probs); + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + ggml_free(ctx0); + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist-cnn.gguf models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + mnist_model model; + std::vector digit; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mnist_model_load(argv[1], model)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, argv[1]); + return 1; + } + + const int64_t t_load_us = ggml_time_us() - t_start_us; + + fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f); + } + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col] / 255.0f); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(model, 1, digit, nullptr); + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + ggml_free(model.ctx); + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main-cpu.cpp b/stable-diffusion.cpp/ggml/examples/mnist/main-cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e1e39801ad88cac978f21c1cb37476a714d0248 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/main-cpu.cpp @@ -0,0 +1,122 @@ +// Use a pre-generated MNIST compute graph for inference on the CPU +// +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. +// Now, you can re-use the compute graph with the "mnist-cpu" tool: +// +// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector digit) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + + // param export/import test + GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == int(0xdeadbeef)); + + // allocate work context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = 128ull*1024*1024; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + ggml_graph_compute_with_ctx(ctx_work, &gfi, n_threads); + + const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + 
fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.cpp b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a8d47ac9c70c315e66076aa5094fb77f1b044cdf --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.cpp @@ -0,0 +1,125 @@ +// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS +// +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. +// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool: +// +// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include "main-mtl.h" + +#include +#include +#include +#include +#include +#include + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + std::vector digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + + // allocate work context + static size_t buf_size = 128ull*1024*1024; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + // this allocates all Metal resources and memory buffers + auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + + int prediction = -1; + + for (int i = 0; i < 1; ++i) { + struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input"); + + if (i % 2 == 0) { + memcpy(input->data, digit.data(), ggml_nbytes(input)); + } else { + memset(input->data, 0, ggml_nbytes(input)); + } + + // the actual inference happens here + prediction = mnist_mtl_eval(ctx_mtl, &gf); + } + + mnist_mtl_free(ctx_mtl); + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.h b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.h new file mode 100644 index 0000000000000000000000000000000000000000..4e661a4d3457716a5867a71f3359fe8be830c8df --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.h @@ -0,0 +1,26 @@ +#pragma once + +struct ggml_context; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_mtl_context; + +struct ggml_mtl_context * mnist_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); + +void mnist_mtl_free(struct ggml_mtl_context * ctx); + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.m b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.m new file mode 100644 index 0000000000000000000000000000000000000000..4b7717920a69b0017d91135c9f7adf118ec81394 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/main-mtl.m @@ -0,0 +1,499 @@ +#import "main-mtl.h" + +#import "ggml/ggml.h" + +#import +#import +#import + +// TODO: couldn't get this to work +//#define GGML_MTL_HEAP + +struct ggml_mtl_context { + struct ggml_context * ctx_data; + struct ggml_context * ctx_eval; + struct ggml_context * ctx_work; + + id device; + id queue; + id library; + +#ifdef GGML_MTL_HEAP + id heap_data; + id heap_eval; +#else + id buffer_data; + id buffer_eval; +#endif + + id out; + + // custom kernels + id function_add; + id pipeline_add; + + id function_relu; + id pipeline_relu; + + id function_soft_max; + id pipeline_soft_max; +}; + +// MSL code +NSString * const msl_library_mnist = @"\ +#include \n\ +using namespace metal; \n\ + \n\ +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) \n\ + \n\ +constant int k_digits [[function_constant(0)]]; \n\ + \n\ +kernel void kernel_add( \n\ + device const float * src0, \n\ + device const float * src1, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ + \n\ +kernel void kernel_relu( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = max(0.0f, src[gid]); \n\ +} \n\ + \n\ +kernel void kernel_soft_max( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + float max = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + max = MAX(max, src[i]); \n\ + } \n\ + float sum = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] = exp(src[i] - max); \n\ + sum += dst[i]; \n\ + } \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] /= sum; \n\ + } \n\ +} \n\ +"; + +struct ggml_mtl_context * mnist_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: allocating\n", __func__); + + struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + + ctx->ctx_data = ctx_data; + ctx->ctx_eval = ctx_eval; + ctx->ctx_work = ctx_work; + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + + // determine if we can use MPS + if (MPSSupportsMTLDevice(ctx->device)) { + fprintf(stderr, "%s: using MPS\n", __func__); + } else { + fprintf(stderr, "%s: not using MPS\n", __func__); + GGML_ASSERT(false && "MPS not supported"); + } + + // compile from source string and show compile log + { + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + } + + // load kernels + { + const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0]; + + MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; + [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; + + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add); + + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu); + + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max); + } + +#ifdef GGML_MTL_HEAP + // MTLHeap approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? 
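+    //       (Managed storage would need an explicit -didModifyRange: call after
+    //        every CPU-side write before the GPU sees the data; Shared avoids that)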
+ // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModeShared; + heap_desc.size = mem_size; + + printf("heap_desc.size = %zu\n", mem_size); + + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + ctx->heap_data.label = @"heap_data"; + + printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]); + + id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + if (!buffer) { + fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__); + exit(1); + } + + // copy data from CPU to GPU + memcpy([buffer contents], mem_buffer, mem_size); + + fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModePrivate; // GPU only + heap_desc.size = mem_size; + + ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + + fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); + } +#else + // MTLBuffer approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this buffer will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + + fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size); + } +#endif + + // allocate buffer for result extraction + { + const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + + ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size); + } + + return ctx; +} + +void mnist_mtl_free(struct ggml_mtl_context * ctx) { + fprintf(stderr, "%s: deallocating\n", __func__); + + free(ctx); +} + +#ifdef GGML_MTL_HEAP + +// make a view of the respective MTL heap +id mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? 
offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + } else { + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + return result; +} + +#else + +// get data / eval buffer + offset +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_data; + } else { + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_eval; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + if (offs != nil) { + *offs = t_offs; + } + + return result; +} + +#endif + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: evaluating\n", __func__); + + id command_buffer = [ctx->queue commandBuffer]; + id encoder = nil; + + size_t offs_src0; + size_t offs_src1; + size_t offs_dst; + + // copy the input data to the GPU + { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input"); + + id id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0); + + memcpy((char *) id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + } + + for (int i = 0; i < gf->n_nodes; ++i) { + fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + switch (gf->nodes[i]->op) { + case GGML_OP_ADD: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_RELU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst 
atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + default: + { + fprintf(stderr, "%s: node %3d, op = %8s, unary op %d not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op), (int) ggml_get_unary_op(gf->nodes[i])); + GGML_ASSERT(false); + return -1; + } + break; + } break; + case GGML_OP_SOFT_MAX: + { +#if 0 + // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixSoftMax + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc]; + + MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; + + [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; +#else + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; +#endif + } break; + case GGML_OP_MUL_MAT: + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixMultiplication + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src[0]->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src[0]->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src[1]->ne[0]; + const int64_t nrows1 = gf->nodes[i]->src[1]->ne[1]; + + const int64_t ncols2 = gf->nodes[i]->ne[0]; + const int64_t nrows2 = gf->nodes[i]->ne[1]; + + GGML_ASSERT(ncols0 == ncols1); + + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src[0]->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src[1]->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device + transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 
alpha:1.0 beta:0.0]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } break; + default: + { + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + GGML_ASSERT(false); + return -1; + } + } + } + + // extract results from the GPU + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; + + id id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0); + id id_dst = ctx->out; + + id encoder_blit = [command_buffer blitCommandEncoder]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; + [encoder_blit endEncoding]; + } + + [command_buffer commit]; + [command_buffer waitUntilCompleted]; + + { + const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; + fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + } + + // select the most probable digit + int result = -1; + { + const float * probs = ctx->out.contents; + + float prob = probs[0]; + + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + + if (probs[i] > prob) { + result = i; + prob = probs[i]; + } + } + } + + return result; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/main.cpp b/stable-diffusion.cpp/ggml/examples/mnist/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33986e4ef573205d7e6f912f632461a0fa16c620 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/main.cpp @@ -0,0 +1,328 @@ +#include "ggml/ggml.h" + +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams +struct mnist_hparams { + int32_t n_input = 784; + int32_t n_hidden = 500; + int32_t n_classes = 10; +}; + +struct mnist_model { + mnist_hparams hparams; + + struct ggml_tensor * fc1_weight; + struct ggml_tensor * fc1_bias; + + struct ggml_tensor * fc2_weight; + struct ggml_tensor * fc2_bias; + + struct ggml_context * ctx; +}; + +// load the model's weights from a file +bool mnist_model_load(const std::string & fname, mnist_model & model) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_input = hparams.n_input; + const int n_hidden = hparams.n_hidden; + const int n_classes = hparams.n_classes; + + ctx_size += n_input * n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 weight + ctx_size += n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 bias + + ctx_size += n_hidden * n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 weight + ctx_size += n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 bias + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size + 1024*1024, + 
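+            // the extra 1 MiB leaves headroom for ggml's per-tensor object
+            // overhead, which the raw ctx_size estimate above does not include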
/*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // Read FC1 layer 1 + { + // Read dimensions + int32_t n_dims; + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + + { + int32_t ne_weight[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne_weight[i]), sizeof(ne_weight[i])); + } + + // FC1 dimensions taken from file, eg. 768x500 + model.hparams.n_input = ne_weight[0]; + model.hparams.n_hidden = ne_weight[1]; + + model.fc1_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_input, model.hparams.n_hidden); + fin.read(reinterpret_cast(model.fc1_weight->data), ggml_nbytes(model.fc1_weight)); + ggml_set_name(model.fc1_weight, "fc1_weight"); + } + + { + int32_t ne_bias[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne_bias[i]), sizeof(ne_bias[i])); + } + + model.fc1_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_hidden); + fin.read(reinterpret_cast(model.fc1_bias->data), ggml_nbytes(model.fc1_bias)); + ggml_set_name(model.fc1_bias, "fc1_bias"); + + // just for testing purposes, set some parameters to non-zero + model.fc1_bias->op_params[0] = 0xdeadbeef; + } + } + + // Read FC2 layer 2 + { + // Read dimensions + int32_t n_dims; + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + + { + int32_t ne_weight[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne_weight[i]), sizeof(ne_weight[i])); + } + + // FC1 dimensions taken from file, eg. 10x500 + model.hparams.n_classes = ne_weight[1]; + + model.fc2_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_hidden, model.hparams.n_classes); + fin.read(reinterpret_cast(model.fc2_weight->data), ggml_nbytes(model.fc2_weight)); + ggml_set_name(model.fc2_weight, "fc2_weight"); + } + + { + int32_t ne_bias[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne_bias[i]), sizeof(ne_bias[i])); + } + + model.fc2_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_classes); + fin.read(reinterpret_cast(model.fc2_bias->data), ggml_nbytes(model.fc2_bias)); + ggml_set_name(model.fc2_bias, "fc2_bias"); + } + } + + fin.close(); + + return true; +} + +// evaluate the model +// +// - model: the model +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const mnist_model & model, + const int n_threads, + std::vector digit, + const char * fname_cgraph + ) { + + const auto & hparams = model.hparams; + + static size_t buf_size = hparams.n_input * sizeof(float) * 4; + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_input); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + ggml_set_name(input, "input"); + + // fc1 MLP = Ax + b + ggml_tensor * fc1 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc1_weight, input), model.fc1_bias); + ggml_tensor * fc2 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc2_weight, ggml_relu(ctx0, fc1)), model.fc2_bias); + + // soft max + ggml_tensor * probs = ggml_soft_max(ctx0, fc2); + ggml_set_name(probs, "probs"); + + // build / export / run the computation graph + 
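+    // (ggml_graph_export() below writes both the graph structure and the tensor
+    //  data to disk; the mnist-cpu and mnist-mtl examples reload that file with
+    //  ggml_graph_import() instead of rebuilding the graph)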
ggml_build_forward_expand(&gf, probs); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //ggml_graph_print (&gf); + ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); + + if (fname_cgraph) { + // export the compute graph for later use + // see the "mnist-cpu" example + ggml_graph_export(&gf, "mnist.ggml"); + + fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); + } + + const float * probs_data = ggml_get_data_f32(probs); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx0); + + return prediction; +} + +#ifdef __cplusplus +extern "C" { +#endif + +int wasm_eval(uint8_t * digitPtr) { + mnist_model model; + if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) { + fprintf(stderr, "error loading model\n"); + return -1; + } + std::vector digit(digitPtr, digitPtr + 784); + int result = mnist_eval(model, 1, digit, nullptr); + ggml_free(model.ctx); + + return result; +} + +int wasm_random_digit(char * digitPtr) { + auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary); + if (!fin) { + fprintf(stderr, "failed to open digits file\n"); + return 0; + } + srand(time(NULL)); + + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read(digitPtr, 784); + + return 1; +} + +#ifdef __cplusplus +} +#endif + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/ggml-model-f32.bin models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + mnist_model model; + std::vector digit; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mnist_model_load(argv[1], model)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "models/ggml-model-f32.bin"); + return 1; + } + + const int64_t t_load_us = ggml_time_us() - t_start_us; + + fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f); + } + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(model, 1, digit, "mnist.ggml"); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/mnist/mnist-cnn.py b/stable-diffusion.cpp/ggml/examples/mnist/mnist-cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..35dda60ab9e32c9622d72b3ecfecd3c1a7b4fc30 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/mnist-cnn.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +import sys +import gguf +import numpy as np +from tensorflow import keras +from tensorflow.keras import layers + +def train(model_name): + # Model / data parameters + num_classes = 10 + input_shape = (28, 28, 1) + + # Load the data and split it between train and test sets + (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + + # Scale images to the [0, 1] range + x_train = x_train.astype("float32") / 255 + x_test = x_test.astype("float32") / 255 + # Make sure images have shape (28, 28, 1) + x_train = np.expand_dims(x_train, -1) + x_test = np.expand_dims(x_test, -1) + print("x_train shape:", x_train.shape) + print(x_train.shape[0], "train samples") + print(x_test.shape[0], "test samples") + + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + + model = keras.Sequential( + [ + keras.Input(shape=input_shape), + layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Flatten(), + layers.Dropout(0.5), + layers.Dense(num_classes, activation="softmax"), + ] + ) + + model.summary() + batch_size = 128 + epochs = 15 + model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1) + + score = model.evaluate(x_test, y_test, verbose=0) + print("Test loss:", score[0]) + print("Test accuracy:", score[1]) + model.save(model_name) + print("Keras model saved to '" + model_name + "'") + +def convert(model_name): + model = keras.models.load_model(model_name) + gguf_model_name = model_name + ".gguf" + gguf_writer = gguf.GGUFWriter(gguf_model_name, "mnist-cnn") + + kernel1 = model.layers[0].weights[0].numpy() + kernel1 = np.moveaxis(kernel1, [2,3], [0,1]) + kernel1 = kernel1.astype(np.float16) + gguf_writer.add_tensor("kernel1", kernel1, raw_shape=(32, 1, 3, 3)) + + bias1 = model.layers[0].weights[1].numpy() + bias1 = np.repeat(bias1, 26*26) + gguf_writer.add_tensor("bias1", bias1, raw_shape=(1, 32, 26, 26)) + + kernel2 = model.layers[2].weights[0].numpy() + kernel2 = np.moveaxis(kernel2, [0,1,2,3], [2,3,1,0]) + kernel2 = kernel2.astype(np.float16) + gguf_writer.add_tensor("kernel2", kernel2, raw_shape=(64, 32, 3, 3)) + + bias2 = model.layers[2].weights[1].numpy() + bias2 = np.repeat(bias2, 11*11) + gguf_writer.add_tensor("bias2", bias2, raw_shape=(1, 64, 11, 11)) + + dense_w = model.layers[-1].weights[0].numpy() + dense_w = dense_w.transpose() + gguf_writer.add_tensor("dense_w", dense_w, raw_shape=(10, 1600)) + + dense_b = model.layers[-1].weights[1].numpy() + gguf_writer.add_tensor("dense_b", dense_b) + + gguf_writer.write_header_to_file() + 
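+    # note: the three writer calls must come in this order, since a GGUF file is
+    # laid out as header, then key/value metadata, then the packed tensor data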
gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file() + gguf_writer.close() + print("Model converted and saved to '{}'".format(gguf_model_name)) + +if __name__ == '__main__': + if len(sys.argv) < 3: + print("Usage: %s ".format(sys.argv[0])) + sys.exit(1) + if sys.argv[1] == 'train': + train(sys.argv[2]) + elif sys.argv[1] == 'convert': + convert(sys.argv[2]) + else: + print("Usage: %s ".format(sys.argv[0])) + sys.exit(1) diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c48868a26491c2f8a83460f2a8ca5bd0c3ce97b9 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore @@ -0,0 +1 @@ +ggml-model-f32.bin diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict new file mode 100644 index 0000000000000000000000000000000000000000..ce64cd78385f863ae7e054b526e8075eaaccce0b --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a25252e28915e147720c19223721f0f53e3317493727ca754a2dd672450ba9 +size 1591571 diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte new file mode 100644 index 0000000000000000000000000000000000000000..d026debec174b65df0cd4d448668d0c744497faa --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fa7898d509279e482958e8ce81c8e77db3f2f8254e26661ceb7762c4d494ce7 +size 7840016 diff --git a/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore b/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..72e8ffc0db8aad71a934dd11e5968bd5109e54b4 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore @@ -0,0 +1 @@ +* diff --git a/stable-diffusion.cpp/ggml/examples/mnist/web/index.html b/stable-diffusion.cpp/ggml/examples/mnist/web/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ab1ef1778becd0eedd2ae03ec7a5e10f1999c63a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/web/index.html @@ -0,0 +1,178 @@ + + + + + + MNIST with GGML + + + +
diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c48868a26491c2f8a83460f2a8ca5bd0c3ce97b9 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/.gitignore @@ -0,0 +1 @@ +ggml-model-f32.bin diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict new file mode 100644 index 0000000000000000000000000000000000000000..ce64cd78385f863ae7e054b526e8075eaaccce0b --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a25252e28915e147720c19223721f0f53e3317493727ca754a2dd672450ba9 +size 1591571 diff --git a/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte new file mode 100644 index 0000000000000000000000000000000000000000..d026debec174b65df0cd4d448668d0c744497faa --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fa7898d509279e482958e8ce81c8e77db3f2f8254e26661ceb7762c4d494ce7 +size 7840016 diff --git a/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore b/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..72e8ffc0db8aad71a934dd11e5968bd5109e54b4 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/web/.gitignore @@ -0,0 +1 @@ +* diff --git a/stable-diffusion.cpp/ggml/examples/mnist/web/index.html b/stable-diffusion.cpp/ggml/examples/mnist/web/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ab1ef1778becd0eedd2ae03ec7a5e10f1999c63a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mnist/web/index.html @@ -0,0 +1,178 @@ +[178-line HTML page; the markup was lost in text extraction. What survives: page title "MNIST with GGML", heading "MNIST digit recognizer with GGML", status text "Loading model and data set, please wait ...", and a drawing canvas whose fallback text is "Your browser does not support the HTML canvas tag."]
diff --git a/stable-diffusion.cpp/ggml/examples/mpt/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/mpt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..09408f9fcdca9c2477bd1bb525be5cf47631e652 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mpt/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# mpt + +set(TEST_TARGET mpt) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# mpt-quantize + +set(TEST_TARGET mpt-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/stable-diffusion.cpp/ggml/examples/mpt/README.md b/stable-diffusion.cpp/ggml/examples/mpt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..39f46bae317ad4fcdb3f5b9e532fce3822fc7415 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mpt/README.md @@ -0,0 +1,27 @@ +# MPT + +Ref: https://github.com/mosaicml/llm-foundry#mpt + +## Usage + +```bash +# get the repo and build it +git clone https://github.com/ggerganov/ggml +cd ggml +mkdir build && cd build +cmake .. +make -j + +# get the model from HuggingFace +# be sure to have git-lfs installed +git clone https://huggingface.co/mosaicml/mpt-30b + +# convert model to FP16 +python3 ../examples/mpt/convert-h5-to-ggml.py ./mpt-30b 1 + +# run inference using FP16 precision +./bin/mpt -m ./mpt-30b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64 + +# quantize the model to 5-bits using Q5_0 quantization +./bin/mpt-quantize ./mpt-30b/ggml-model-f16.bin ./mpt-30b/ggml-model-q5_0.bin q5_0 +``` diff --git a/stable-diffusion.cpp/ggml/examples/mpt/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/mpt/convert-h5-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd6459feed06e2eb4248d874b8003ea920a3841 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mpt/convert-h5-to-ggml.py @@ -0,0 +1,169 @@ +import os +import struct +import sys + +import torch +from transformers import AutoConfig, AutoTokenizer + + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns a list of utf-8 bytes and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + This also avoids mapping to whitespace/control characters the bpe code barfs on.
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + + cs = [chr(n) for n in cs] + + return dict(zip(bs, cs)) + + +def count_model_parts(dir_model: str) -> int: + """Returns the number of model parts in the model directory.""" + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print(f"Found {num_parts} model parts in {dir_model}") + return num_parts + + +if len(sys.argv) < 3: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + + +# output in the same directory as the model +dir_model = sys.argv[1] +# get number of model parts +num_parts = count_model_parts(dir_model) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin" + + +tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) +config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) +hparams = config.to_dict() + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["d_model"])) +fout.write(struct.pack("i", hparams["max_seq_len"])) +fout.write(struct.pack("i", hparams["n_heads"])) +fout.write(struct.pack("i", hparams["n_layers"])) +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"])) +fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0)) +fout.write(struct.pack("i", ftype)) + +vocab_size = hparams["vocab_size"] + +encoder = tokenizer.vocab +# Add added_tokens (special tokens) to the encoder +encoder.update(tokenizer.get_added_vocab()) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v: k for k, v in byte_encoder.items()} + +counter = 0 +# sort by value +for key in sorted(encoder, key=encoder.get): + # workaround for key error when c not found + text = "" + for c in key: + if c not in byte_decoder: + text += c + else: + text += chr(byte_decoder[c]) + text = bytearray(text, encoding="utf-8") + fout.write(struct.pack("i", len(text))) + fout.write(text) + counter += 1 + +# Repeat last token until vocab_size +while counter < vocab_size: + fout.write(struct.pack("i", len(text))) + fout.write(text) + counter += 1 + +if num_parts == 0: + part_names = ("pytorch_model.bin",) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + print(f"\n* Loading part: {part_name}") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + for name in model_part.keys(): + data = model_part[name].squeeze() + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + # default type is fp32 + ftype_cur = 0 + if ftype == 1 and name[-7:] == ".weight" and n_dims > 1: + ftype_cur = 1 + data = data.to(dtype=torch.float16 if ftype_cur == 1 else torch.float32).numpy() + + print( + "Processing variable: " + name + " with shape: ", + data.shape, + "->", + data.dtype, + ) + + # header 
+ name_bytes = name.encode("utf-8") + fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(name_bytes) + + # data + data.tofile(fout) + + # release memory + del model_part + +fout.close() + +print("Done. Output file: " + fname_out) +print("")
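The fixed header this script emits (and that mpt_model_load in main.cpp below reads back field by field) is just a packed sequence of little-endian values. A quick read-back sketch, assuming a converted file named ggml-model-f16.bin:

```python
import struct

# mirror of the struct.pack calls above: magic, five int32 hparams,
# two float32 attention settings, then the ftype flag
with open("ggml-model-f16.bin", "rb") as f:
    magic, d_model, max_seq_len, n_heads, n_layers, n_vocab = struct.unpack("<6i", f.read(24))
    alibi_bias_max, clip_qkv = struct.unpack("<2f", f.read(8))
    (ftype,) = struct.unpack("<i", f.read(4))

assert magic == 0x67676D6C  # "ggml" in hex, as written above
print(d_model, max_seq_len, n_heads, n_layers, n_vocab, ftype)
```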
diff --git a/stable-diffusion.cpp/ggml/examples/mpt/main.cpp b/stable-diffusion.cpp/ggml/examples/mpt/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b2fa02c22dbe0445570a03de826303b187d506d --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mpt/main.cpp @@ -0,0 +1,1039 @@ +#include "ggml/ggml.h" + +#include "common-ggml.h" +#include "common.h" + +#include <algorithm> +#include <chrono> +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <ctime> +#include <fstream> +#include <iterator> +#include <map> +#include <random> +#include <string> +#include <thread> +#include <vector> + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// no defaults for now +struct mpt_hparams { + int32_t d_model = 0; + int32_t max_seq_len = 0; + int32_t n_heads = 0; + int32_t n_layers = 0; + int32_t n_vocab = 0; + float alibi_bias_max = 0; + float clip_qkv = 0; + int32_t ftype = 0; + int32_t n_ctx = 0; + +}; + +struct mpt_layer { + // pre normalization + struct ggml_tensor * norm_1_weight; + + // attention + struct ggml_tensor * c_attn_wqkv_weight; + struct ggml_tensor * c_attn_out_proj_weight; + + // post normalization + struct ggml_tensor * norm_2_weight; + + // ff + struct ggml_tensor * ffn_up_proj; + struct ggml_tensor * ffn_down_proj; +}; + +struct mpt_model { + mpt_hparams hparams; + + struct ggml_tensor * wte_weight; // token embedding (also used as the tied output head) + struct ggml_tensor * norm_f_weight; // final layer norm + + std::vector<mpt_layer> layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + struct ggml_context * ctx; + std::map<std::string, struct ggml_tensor *> tensors; +}; + +struct mpt_params { + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + + int32_t seed = -1; // RNG seed + int32_t n_predict = 200; // new tokens to predict + int32_t n_batch = 8; // batch size for prompt processing + int32_t n_ctx = 512; + + std::string model = ""; // model path + std::string prompt = ""; + std::string token_test = ""; + + bool perplexity = false; + + // sampling parameters + int32_t top_k = 0; + float top_p = 1.0f; + float temp = 0.8f; + int32_t repeat_last_n = 64; + float repeat_penalty = 1.02f; + +}; + +void mpt_print_usage(int /*argc*/, char ** argv, const mpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: random)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = n_vocab)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.2f)\n", params.top_p); + fprintf(stderr, " --temp N temperature (default: %.2f)\n", params.temp); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for the repeat penalty (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +bool mpt_params_parse(int argc, char ** argv, mpt_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(argv[++i]); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = argv[++i]; + } else if (arg == "-n" || arg == "--n_predict") { + params.n_predict = std::stoi(argv[++i]); + } else if (arg == "--top_k") { + params.top_k = std::max(1, std::stoi(argv[++i])); + } else if (arg == "--top_p") { + params.top_p = std::stof(argv[++i]); + } else if (arg == "--temp") { + params.temp = std::stof(argv[++i]); + } else if (arg == "--repeat-last-n") { + params.repeat_last_n = std::stoi(argv[++i]); + } else if (arg == "--repeat-penalty") { + params.repeat_penalty = std::stof(argv[++i]); + } else if (arg == "--perplexity") { + params.perplexity = true; + } else if (arg == "-c" || arg == "--ctx-size") { + params.n_ctx = std::stoi(argv[++i]); + } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch = std::stoi(argv[++i]); + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + mpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + fprintf(stderr, "Invalid file param"); + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + break; + } + params.prompt.clear(); + std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = argv[++i]; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + mpt_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +}
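After the vocab, both the converter above and mpt_model_load below walk tensor records laid out as (n_dims, name_len, ftype, dims[n_dims] stored innermost-first, name bytes, raw data). A sketch of reading one record in Python, valid only for unquantized f32/f16 files (file handle and names are illustrative):

```python
import struct
import numpy as np

def read_tensor_record(f):
    # header: dimension count, name length, per-tensor ftype (0 = f32, 1 = f16)
    n_dims, name_len, ftype = struct.unpack("<3i", f.read(12))
    # dims mirror data.shape[n_dims - 1 - i] in the converter above
    dims = struct.unpack(f"<{n_dims}i", f.read(4 * n_dims))
    name = f.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype == 1 else np.float32
    data = np.fromfile(f, dtype=dtype, count=int(np.prod(dims)))
    return name, dims, data
```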
+ +// load the model's weights from a file +bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *)&magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.d_model, sizeof(hparams.d_model)); + fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); + fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); + fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); + fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: d_model = %d\n", __func__, hparams.d_model); + printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_heads = %d\n", __func__, hparams.n_heads); + printf("%s: n_layers = %d\n", __func__, hparams.n_layers); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); + printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + const int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector<char> buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + // Convert token from utf-8 + std::wstring word_multibytes = convert_to_wstring(word); + word.resize(word_multibytes.size()); + for (size_t w = 0; w < word_multibytes.size(); w++) { + word[w] = uint8_t(word_multibytes[w]); + } + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit + // floats or quantized in order to save memory and also to speed up the + // computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), + model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + const auto & hparams = model.hparams; + const size_t n_ctx = hparams.n_ctx; + + { + const size_t n_embd = hparams.d_model; + const size_t n_layer = hparams.n_layers; + const size_t n_vocab = hparams.n_vocab; + + ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight + ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // norm_f_weight + + ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight + ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight + ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight + ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight + ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight + ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight + + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v + + ctx_size += (1 + 6 * n_layer) * 512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/
ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const size_t n_embd = hparams.d_model; + const size_t n_layer = hparams.n_layers; + const size_t n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["transformer.wte.weight"] = model.wte_weight; + model.tensors["transformer.norm_f.weight"] = model.norm_f_weight; + + for (int i = 0; i < (int) n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); + layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); + layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); + + // map by name + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj; + model.tensors["transformer.blocks." 
+ std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const size_t n_embd = hparams.d_model; + const size_t n_layer = hparams.n_layers; + + const int64_t n_mem = n_layer * n_ctx; + const int64_t n_elements = n_embd * n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast<char *>(&length), sizeof(length)); + fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = {1, 1}; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, + "%s: tensor '%s' has wrong shape in model file: got [%5d, " + "%5d], expected [%5d, %5d]\n", + __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], + ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, + "%s: tensor '%s' has wrong size in model file: got %zu, " + "expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe); + return false; + } + + fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); + + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, + const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all, size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.d_model; + const int n_layer = hparams.n_layers; + const int n_head = hparams.n_heads; + const int n_vocab = hparams.n_vocab; + const int n_ctx = hparams.n_ctx; + const float eps = 1e-5f; + + static
size_t buf_size = 256u * 1024 * 1024; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token * N > buf_size) { + const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead + // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, + // buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); + + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); + + for (int il = 0; il < n_layer; ++il) { + + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // a = self.ln_1(x) + { + cur = ggml_norm(ctx0, inpL, eps); + + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); + } + + // self-attention + // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, + // attn_bias=attn_bias, attention_mask=attention_mask, + // is_causal=is_causal) + { + // compute QKV + cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); + + if (model.hparams.clip_qkv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv); + } + + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); + + // store key and value to memory + { + struct ggml_tensor * k = + ggml_view_1d(ctx0, model.memory_k, N * n_embd, + (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); + struct ggml_tensor * v = + ggml_view_1d(ctx0, model.memory_v, N * n_embd, + (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, + // 2, 1, 3) [64, N, 12] + struct ggml_tensor * Q = ggml_permute( + ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, + 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, + // 3) [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, + il * n_ctx * ggml_element_size(model.memory_k) * n_embd), + n_embd / n_head, n_head, n_past + N), + 0, 2, 1, 3); + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); + + struct ggml_tensor * 
KQ_scaled_alibi = + ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, + // 2, 0, 3).contiguous() [n_past + N, 64, 12] + struct ggml_tensor * V_trans = ggml_cpy( + ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, + il * n_ctx * ggml_element_size(model.memory_v) * n_embd), + n_embd / n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } + } + + inpL = ggml_add(ctx0, inpL, cur); + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + // m = self.ln_2(x) + { + cur = ggml_norm(ctx0, inpL, eps); + + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); + } + + // n = self.mlp(m) + { + + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); + } + + // x = x + n + inpL = ggml_add(ctx0, inpL, cur); + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + inpL = ggml_norm(ctx0, inpL, eps); + // inpL = ln_f_g*inpL + inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // output embedding weight tied to input embedding + inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); + + // logits -> probs + // inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + // std::cout << "Qcur" << std::endl; + // print_tensor(Qcur); + + // if (n_past%100 == 0) { + // ggml_graph_print(&gf); + // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); + // } + + if (logits_all) { + // return result for all tokens + embd_w.resize(n_vocab * N); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N); + } else { + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + } + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0) / N; + } + // printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +}
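The softmax helper below uses the standard max-subtraction trick for numerical stability; subtracting the maximum logit leaves the result unchanged because the common factor cancels:

$$\operatorname{softmax}(z)_i = \frac{e^{z_i - m}}{\sum_j e^{z_j - m}} = \frac{e^{z_i}}{\sum_j e^{z_j}}, \qquad m = \max_k z_k$$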
+ +std::vector<float> softmax(const std::vector<float> & logits) { + std::vector<float> probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +int perplexity(const mpt_params & params) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + printf("%s: n_threads = %d\n", __func__, params.n_threads); + printf("%s: n_batch = %d\n", __func__, params.n_batch); + printf("%s: n_ctx = %d\n", __func__, params.n_ctx); + printf("\n"); + + int64_t t_load_us = 0; + + gpt_vocab vocab; + mpt_model model; + + model.hparams.n_ctx = params.n_ctx; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + } + + int64_t t_predict_us = 0; + + std::vector<float> logits; + + // tokenize the prompt + std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + + // determine the required inference memory per token: + size_t mem_per_token = 0; + mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); + + int count = 0; + + const int n_chunk = embd_inp.size() / params.n_ctx; + + const int n_vocab = model.hparams.n_vocab; + const int n_batch = params.n_batch; + + double nll = 0.0; + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + + for (int i = 0; i < n_chunk; ++i) { + + const int start = i * params.n_ctx; + const int end = start + params.n_ctx; + + const int num_batches = (params.n_ctx + n_batch - 1) / n_batch; + + std::vector<float> logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + for (int j = 0; j < num_batches; ++j) { + + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + std::vector<gpt_vocab::id> embd; + + // fill the batch with tokens from this chunk + for (int p = 0; p < batch_size; p++) { + embd.push_back(embd_inp[batch_start + p]); + } + + std::vector<float> batch_logits;// = llama_get_logits(ctx); + + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_eval(model, params.n_threads, j * batch_size, embd, batch_logits, true, mem_per_token)) { + printf("%s: failed to evaluate model\n", __func__); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + + logits.insert(logits.end(), batch_logits.data(), batch_logits.data() + batch_size * n_vocab); + + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration<float>(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%d minutes\n", total_seconds / 60); + + printf("\nChunk\tPPL cumulative\tPPL chunk\n"); + } + + // We get the logits for all the tokens in the context window (params.n_ctx) + // from mpt_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half of the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. + + double nllchunk = 0.0; + int countchunk = 0; + + for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { + // Calculate probability of next token, given the previous ones. + const std::vector<float> tok_logits( + logits.begin() + (j + 0) * n_vocab, + logits.begin() + (j + 1) * n_vocab); + + const float prob = softmax(tok_logits)[embd_inp[start + j + 1]]; + + nllchunk += -std::log(prob); + ++countchunk; + } + + nll += nllchunk; + count += countchunk; + + // perplexity is e^(average negative log-likelihood) + printf("%d\t%.8lf\t%.8lf\n", i + 1, std::exp(nll / count), std::exp(nllchunk / countchunk)); + fflush(stdout); + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); + printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / (n_chunk * params.n_ctx)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); + } + + ggml_free(model.ctx); + + return 0; +}
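As the final printf in the loop above notes, perplexity is the exponential of the average negative log-likelihood. A tiny worked case: two tokens predicted with probabilities 0.5 and 0.25 give

$$\mathrm{PPL} = \exp\!\Big(\tfrac{1}{2}\big(-\ln 0.5 - \ln 0.25\big)\Big) = \exp\!\Big(\tfrac{0.693 + 1.386}{2}\Big) \approx 2.83$$

so lower values mean the model was less surprised by the text.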
+ +int main(int argc, char ** argv) { + mpt_params params; + + if (!mpt_params_parse(argc, argv, params)) { + return 1; + } + + if (params.perplexity) { + return perplexity(params); + } + + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + if (params.seed < 0) { + params.seed = time(NULL); + } + + if (params.n_predict < 0) { + params.n_predict = 0; + } + + printf("%s: seed = %d\n", __func__, params.seed); + printf("%s: n_threads = %d\n", __func__, params.n_threads); + printf("%s: n_batch = %d\n", __func__, params.n_batch); + printf("%s: n_ctx = %d\n", __func__, params.n_ctx); + printf("%s: n_predict = %d\n\n", __func__, params.n_predict); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + mpt_model model; + + model.hparams.n_ctx = params.n_ctx; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + if (params.top_k == 0) { + params.top_k = model.hparams.n_vocab; + } + + if (params.repeat_last_n == -1) { + params.repeat_last_n = params.n_ctx; + } + + printf("\n"); + printf("%s: temp = %.3f\n", __func__, params.temp); + printf("%s: top_k = %d\n", __func__, params.top_k); + printf("%s: top_p = %.3f\n", __func__, params.top_p); + printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); + printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector<gpt_vocab::id> last_n_tokens(params.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + // tokenize the prompt + std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt); + + printf("\n"); + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + + for (size_t i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%zu] = %6d\n", __func__, i, embd_inp[i]); + }
+ printf("\n"); + + std::vector<gpt_vocab::id> embd; + std::vector<float> logits; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); + + int n_past = 0; + int n_consumed = 0; + int n_sampled = 0; + + while (n_sampled < params.n_predict) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) { + printf("%s: failed to predict\n", __func__); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + + n_past += embd.size(); + embd.clear(); + } + + if ((int)embd_inp.size() <= n_consumed) { + // sample next token + + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const int repeat_last_n = params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - model.hparams.n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + ++n_sampled; + + } else { + // if here, it means we are still processing the input prompt + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[n_consumed]); + + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 0) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n\n"); + printf("%s: sampled tokens = %8d\n", __func__, n_sampled); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); + printf("%s: sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f, t_sample_us / 1000.0f / n_sampled); + printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); + } + + ggml_free(model.ctx); + + return 0; +}
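Both the loader above and the quantizer below pack a quantization version into the ftype field as ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype, and unpack it with division and modulo. A sketch of the arithmetic, with the two constants set to illustrative values rather than taken from ggml.h:

```python
GGML_QNT_VERSION = 2             # assumed for illustration
GGML_QNT_VERSION_FACTOR = 1000   # assumed for illustration

ftype = 8                        # e.g. some quantized file type id
ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype  # 2008

# the loader reverses it:
qntvr = ftype_dst // GGML_QNT_VERSION_FACTOR      # 2
raw_ftype = ftype_dst % GGML_QNT_VERSION_FACTOR   # 8
assert (qntvr, raw_ftype) == (GGML_QNT_VERSION, ftype)
```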
diff --git a/stable-diffusion.cpp/ggml/examples/mpt/quantize.cpp b/stable-diffusion.cpp/ggml/examples/mpt/quantize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0c9dda8229ac0abd7346c06abd9380a9e9e1e00 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/mpt/quantize.cpp @@ -0,0 +1,186 @@ +#include "ggml/ggml.h" + +#include "common-ggml.h" +#include "common.h" + +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <map> +#include <regex> +#include <string> +#include <vector> + +struct mpt_hparams { + int32_t d_model = 0; + int32_t max_seq_len = 0; + int32_t n_heads = 0; + int32_t n_layers = 0; + int32_t n_vocab = 0; + float alibi_bias_max = 0; + float clip_qkv = 0; + int32_t ftype = 0; +}; + +// quantize a model +bool mpt_model_quantize(const std::string & fname_inp, + const std::string & fname_out, ggml_ftype ftype) { + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, + fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, + fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *)&magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", + __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *)&magic, sizeof(magic)); + } + + mpt_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.d_model, sizeof(hparams.d_model)); + finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); + finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); + finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); + finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: d_model = %d\n", __func__, hparams.d_model); + printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); + printf("%s: n_heads = %d\n", __func__, hparams.n_heads); + printf("%s: n_layers = %d\n", __func__, hparams.n_layers); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); + printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.d_model, sizeof(hparams.d_model)); + fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); + fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads)); + fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers)); + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); + fout.write((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + const int32_t n_vocab = hparams.n_vocab; + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read((char *)&len, sizeof(len)); + fout.write((char *)&len, sizeof(len)); + + word.resize(len); + finp.read((char *)word.data(), len); + fout.write((char *)word.data(), len); + } + } + + printf("%s: quantizing tensors\n", __func__); + + // regexes of tensor names to be quantized + const std::vector<std::string> to_quant = { + ".*weight", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, + fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./mpt-quantize models/mpt/ggml-model.bin +// models/mpt/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s 
model-f32.bin model-quant.bin type\n", + argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = {0, NULL, false}; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", + __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, + t_quantize_us / 1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, + (t_main_end_us - t_main_start_us) / 1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/prompts/dolly-v2.txt b/stable-diffusion.cpp/ggml/examples/prompts/dolly-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecdb0b7a6e27e44ec06016568924bec7c67d3a9e --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/dolly-v2.txt @@ -0,0 +1,100 @@ +Hello World! => 12092,3645,2 +I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476 +The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449 +"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574 +'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464 +"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449 +The book costs $19.99 => 510,1984,4815,370,746,15,1525 +"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449 +Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2 +C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32 +W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2 +H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32 +I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15 +Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32 +The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15 +I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15 +The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15 +She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15 +We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15 +He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15 +They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15 +The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15 +I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15 +We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15 +The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15 +She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15 +He is a talented actor in Hollywood. 
=> 1328,310,247,21220,12353,275,14759,15 +The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15 +I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15 +The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15 +They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15 +She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15 +We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15 +He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15 +The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15 +I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15 +They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15 +She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15 +He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15 +The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15 +I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15 +They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15 +She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15 +We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15 +The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15 +He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15 +The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15 +I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15 +They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15 +She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15 +We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15 +The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15 +He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15 +The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15 +I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15 +They are renovating their house. => 3726,403,30074,839,616,2419,15 +She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15 +We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15 +The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15 +He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15 +The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15 +I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15 +They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15 +She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15 +We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15 +The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15 +He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15 +The students are participating in a science fair. 
=> 510,3484,403,15299,275,247,5859,4344,15 +I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15 +They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15 +She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15 +We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15 +The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15 +He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15 +The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15 +I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15 +They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15 +She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15 +We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15 +The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15 +He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15 +The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15 +I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15 +They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15 +She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15 +We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15 +The traffic is congest => 510,7137,310,25801 +The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15 +I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15 +She plays the piano beautifully. => 2993,7120,253,18542,27839,15 +The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15 +I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15 +He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15 +The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15 +She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15 +The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15 +We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15 +He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15 +The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15 +She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/gpt-2-chinese.txt b/stable-diffusion.cpp/ggml/examples/prompts/gpt-2-chinese.txt new file mode 100644 index 0000000000000000000000000000000000000000..919829d8155b673104cbc32c0e857d50db83860a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/gpt-2-chinese.txt @@ -0,0 +1 @@ +请问洗手间在哪里? => 6435,7309,3819,2797,7313,1762,1525,7027,8043 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/gpt-2.txt b/stable-diffusion.cpp/ggml/examples/prompts/gpt-2.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2ed9310db39567f008e666d35189b0fe0ccd57a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/gpt-2.txt @@ -0,0 +1,100 @@ +Hello World! => 15496,2159,0 +I can't believe it's already Friday!" 
=> 40,460,470,1975,340,338,1541,3217,2474 +The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526 +"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496 +'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637 +"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526 +The book costs $19.99 => 464,1492,3484,720,1129,13,2079 +"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526 +Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0 +C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30 +W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0 +H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30 +I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13 +Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30 +The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13 +I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13 +The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13 +She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13 +We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13 +He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13 +They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13 +The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13 +I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13 +We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13 +The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13 +She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13 +He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13 +The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13 +I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13 +The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13 +They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13 +She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13 +We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13 +He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13 +The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13 +I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13 +They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13 +She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13 +He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13 +The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13 +I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13 +They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13 +She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13 +We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13 +The car broke down on the way to work. 
=> 464,1097,6265,866,319,262,835,284,670,13 +He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13 +The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13 +I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13 +They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13 +She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13 +We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13 +The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13 +He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13 +The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13 +I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13 +They are renovating their house. => 2990,389,24317,803,511,2156,13 +She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13 +We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13 +The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13 +He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13 +The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13 +I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13 +They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13 +She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13 +We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13 +The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13 +He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13 +The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13 +I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13 +They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13 +She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13 +We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13 +The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13 +He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13 +The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13 +I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13 +They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13 +She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13 +We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13 +The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13 +He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13 +The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13 +I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13 +They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13 +She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13 +We should try a new recipe for dinner. 
=> 1135,815,1949,257,649,8364,329,8073,13 +The traffic is congest => 464,4979,318,22791 +The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13 +I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13 +She plays the piano beautifully. => 3347,5341,262,19132,21104,13 +The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13 +I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13 +He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13 +The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13 +She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13 +The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13 +We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13 +He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13 +The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13 +She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/gpt-j.txt b/stable-diffusion.cpp/ggml/examples/prompts/gpt-j.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2ed9310db39567f008e666d35189b0fe0ccd57a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/gpt-j.txt @@ -0,0 +1,100 @@ +Hello World! => 15496,2159,0 +I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474 +The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526 +"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496 +'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637 +"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526 +The book costs $19.99 => 464,1492,3484,720,1129,13,2079 +"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526 +Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0 +C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30 +W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0 +H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30 +I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13 +Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30 +The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13 +I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13 +The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13 +She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13 +We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13 +He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13 +They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13 +The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13 +I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13 +We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13 +The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13 +She is wearing a beautiful red dress. 
=> 3347,318,5762,257,4950,2266,6576,13 +He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13 +The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13 +I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13 +The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13 +They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13 +She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13 +We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13 +He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13 +The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13 +I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13 +They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13 +She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13 +He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13 +The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13 +I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13 +They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13 +She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13 +We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13 +The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13 +He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13 +The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13 +I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13 +They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13 +She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13 +We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13 +The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13 +He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13 +The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13 +I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13 +They are renovating their house. => 2990,389,24317,803,511,2156,13 +She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13 +We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13 +The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13 +He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13 +The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13 +I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13 +They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13 +She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13 +We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13 +The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13 +He is a dedicated teacher who inspires his students. 
=> 1544,318,257,7256,4701,508,38934,465,2444,13 +The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13 +I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13 +They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13 +She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13 +We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13 +The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13 +He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13 +The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13 +I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13 +They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13 +She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13 +We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13 +The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13 +He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13 +The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13 +I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13 +They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13 +She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13 +We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13 +The traffic is congest => 464,4979,318,22791 +The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13 +I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13 +She plays the piano beautifully. => 3347,5341,262,19132,21104,13 +The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13 +I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13 +He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13 +The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13 +She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13 +The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13 +We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13 +He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13 +The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13 +She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox-japanese.txt b/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox-japanese.txt new file mode 100644 index 0000000000000000000000000000000000000000..c39df160a5f12a679a532b141a63c4807bbe3888 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox-japanese.txt @@ -0,0 +1 @@ +明日の天気はどうですか。 => 263,7353,268,18461,271,1722,18405,265 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox.txt b/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecdb0b7a6e27e44ec06016568924bec7c67d3a9e --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/gpt-neox.txt @@ -0,0 +1,100 @@ +Hello World! 
=> 12092,3645,2 +I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476 +The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449 +"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574 +'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464 +"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449 +The book costs $19.99 => 510,1984,4815,370,746,15,1525 +"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449 +Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2 +C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32 +W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2 +H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32 +I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15 +Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32 +The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15 +I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15 +The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15 +She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15 +We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15 +He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15 +They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15 +The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15 +I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15 +We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15 +The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15 +She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15 +He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15 +The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15 +I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15 +The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15 +They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15 +She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15 +We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15 +He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15 +The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15 +I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15 +They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15 +She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15 +He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15 +The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15 +I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15 +They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15 +She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15 +We should go hiking in the mountains. 
=> 1231,943,564,33061,275,253,14700,15 +The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15 +He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15 +The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15 +I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15 +They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15 +She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15 +We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15 +The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15 +He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15 +The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15 +I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15 +They are renovating their house. => 3726,403,30074,839,616,2419,15 +She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15 +We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15 +The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15 +He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15 +The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15 +I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15 +They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15 +She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15 +We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15 +The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15 +He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15 +The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15 +I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15 +They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15 +She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15 +We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15 +The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15 +He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15 +The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15 +I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15 +They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15 +She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15 +We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15 +The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15 +He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15 +The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15 +I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15 +They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15 +She is practicing martial arts for self-defense. 
=> 2993,310,25815,29731,14635,323,1881,14,29337,15 +We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15 +The traffic is congest => 510,7137,310,25801 +The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15 +I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15 +She plays the piano beautifully. => 2993,7120,253,18542,27839,15 +The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15 +I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15 +He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15 +The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15 +She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15 +The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15 +We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15 +He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15 +The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15 +She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/polyglot-ko.txt b/stable-diffusion.cpp/ggml/examples/prompts/polyglot-ko.txt new file mode 100644 index 0000000000000000000000000000000000000000..41fa0085b4a8102558fef09512e6a5f3a28e75d6 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/polyglot-ko.txt @@ -0,0 +1,3 @@ +이것은 테스트 이다. => 12271,296,6474,28037,17 +걱정할 필요 없다. => 18311,482,1062,550,267,17 +버그는 언젠가 고쳐진다. => 6904,272,8575,10381,1765,17 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/replit.txt b/stable-diffusion.cpp/ggml/examples/prompts/replit.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b5ffcf177bfddde986f522c72599f7ea92bdf4d --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/replit.txt @@ -0,0 +1,100 @@ +Hello World! => 6466,147,2317,350 +I can't believe it's already Friday!" => 286,512,172,185,13392,393,172,155,3239,147,29249,8537 +The URL for the website is https://www.example.com." => 505,5635,250,170,11745,235,147,303,262,552,148,811,148,241,148,161 +"She said, 'I love to travel.'" => 161,10386,4089,150,206,286,8440,194,147,12363,148,172,161 +'The temperature is 25.5°C.' => 172,505,147,9502,235,147,20022,8516,228,148,172 +"Let's meet at 2:30 p.m. in the park." => 161,8997,172,155,17120,536,147,162,5245,147,207,148,204,148,219,170,147,17664,148,161 +The book costs $19.99 => 505,147,2277,17494,236,166,11824 +"John's favorite color is blue." => 161,7475,172,155,147,11105,147,349,235,17046,148,161 +Th@nk y0u f0r y0ur h3lp! => 6309,240,9019,147,237,159,247,147,202,159,223,147,237,159,2458,147,226,171,3899,350 +C@n I g3t a c0ffee, pl3@se? => 228,240,211,398,147,267,171,185,216,147,196,159,13360,163,150,147,1287,171,240,155,163,272 +W0w! Th@t's @m@zing! => 450,159,274,350,147,6309,240,185,172,155,268,204,240,301,248,350 +H0w 4re y0u t0d@y? => 304,159,274,320,440,147,237,159,247,147,185,159,182,240,237,272 +I l0ve t0 tr@vel @r0und the w0rld. => 286,997,159,1290,147,185,159,147,490,240,3893,268,223,159,3981,170,147,274,159,223,2833,148 +Wh@t's y0ur f@v0rite m0vie? => 450,226,240,185,172,155,147,237,159,2458,147,202,240,252,159,5961,163,147,204,159,24373,272 +The cat is sleeping on the mat. => 505,147,1604,235,147,3987,248,347,170,147,1297,148 +I need to buy some groceries for dinner. 
=> 286,1645,194,147,8068,1499,147,10022,1037,10023,250,147,182,2749,148 +The sun is shining brightly in the sky. => 505,147,5852,235,147,7304,2967,147,215,649,391,219,170,147,7310,148 +She is reading a book in the park. => 10386,235,9838,216,147,2277,219,170,147,17664,148 +We went for a walk on the beach yesterday. => 3250,10825,250,216,147,8156,347,170,294,5371,147,28830,148 +He plays the guitar like a pro. => 5301,7084,155,170,147,4604,2214,1425,216,3474,148 +They are going to the movies tonight. => 18815,429,6552,194,170,147,15877,194,7907,148 +The flowers are blooming in the garden. => 505,147,22953,155,429,147,10411,2799,248,219,170,147,22140,148 +I enjoy listening to classical music. => 286,23162,15876,248,194,239,4251,147,7395,148 +We need to buy groceries for the week. => 3250,1645,194,147,8068,147,10022,1037,10023,250,170,9238,148 +The dog is chasing its tail in circles. => 505,147,6540,235,147,196,916,248,1602,147,5129,219,147,4095,155,148 +She is wearing a beautiful red dress. => 10386,235,147,16427,248,216,147,23447,147,1160,147,14592,148 +He is a talented actor in Hollywood. => 5301,235,216,147,29750,246,147,5112,219,147,16924,391,10477,148 +The children are playing in the playground. => 505,7934,429,7084,248,219,170,7084,12055,148 +I'm going to visit my grandparents this weekend. => 286,172,204,6552,194,9939,1247,147,11806,12019,291,9238,314,148 +The coffee tastes bitter without sugar. => 505,147,21526,147,20931,155,5145,1430,1988,147,28759,148 +They are planning a surprise party for her. => 18815,429,147,23661,216,147,29240,147,7344,250,1869,148 +She sings like an angel on stage. => 10386,147,155,6502,1425,426,147,26028,347,12685,148 +We should take a vacation to relax. => 3250,936,4654,216,147,15388,946,194,1998,2744,148 +He is studying medicine at the university. => 5301,235,7959,248,147,20742,1668,536,170,147,8025,148 +The rain is pouring heavily outside. => 505,147,6885,235,5306,248,1189,5451,391,8096,148 +I enjoy watching romantic movies. => 286,23162,147,3355,248,147,26080,4140,147,15877,148 +They are celebrating their anniversary today. => 18815,429,147,30000,5841,1669,147,24734,5464,1770,13386,148 +She dances gracefully to the music. => 10386,147,182,1626,155,147,267,8771,8001,194,170,147,7395,148 +He is an excellent basketball player. => 5301,235,426,147,12300,675,185,147,26646,5132,6294,148 +The baby is sleeping soundly in the crib. => 505,147,23597,235,147,3987,248,12642,391,219,170,147,7696,215,148 +I need to finish my homework before dinner. => 286,1645,194,147,6717,1247,147,1071,2722,2643,147,182,2749,148 +They are organizing a charity event next month. => 18815,429,147,16442,248,216,1054,1511,1663,2399,12821,148 +She is cooking a delicious meal for us. => 10386,235,147,20453,248,216,3936,23455,147,26658,250,147,539,148 +We should go hiking in the mountains. => 3250,936,4242,147,2254,5357,219,170,147,204,18028,155,148 +The car broke down on the way to work. => 505,7553,147,510,10036,4288,347,170,3699,194,1916,148 +He loves playing video games in his free time. => 5301,8440,155,7084,248,8722,147,11281,219,1439,4002,801,148 +The birds are chirping in the trees. => 505,147,13043,155,429,147,3904,223,4639,219,170,5311,155,148 +I want to learn how to play the piano. => 286,1857,194,14167,2496,194,7084,170,147,207,23635,148 +They are building a new shopping mall in the city. => 18815,429,11038,216,277,147,22184,147,204,609,219,170,147,2416,148 +She is writing a novel in her spare time. 
=> 10386,235,3242,216,147,25814,219,1869,6772,2382,801,148 +We are going to the zoo this Saturday. => 3250,429,6552,194,170,147,25101,291,147,31426,148 +The cake looks delicious with chocolate frosting. => 505,147,24422,16303,3936,23455,312,147,5619,533,2239,147,202,3973,3431,148 +He is a talented painter who sells his artwork. => 5301,235,216,147,29750,246,147,9226,279,2888,13004,155,1439,12234,2722,148 +The students are studying for their exams. => 505,15707,429,7959,248,250,1669,147,12398,155,148 +I enjoy swimming in the ocean. => 286,23162,147,4729,8528,248,219,170,147,26193,148 +They are renovating their house. => 18815,429,991,10724,3643,1669,13788,148 +She is practicing yoga to stay healthy. => 10386,235,147,18453,248,147,5063,1186,194,15344,147,28550,148 +We should plant flowers in the garden. => 3250,936,147,9212,147,22953,155,219,170,147,22140,148 +The traffic is heavy during rush hour. => 505,147,11097,235,147,22232,4340,147,22319,147,5686,148 +He is a skilled chef who creates amazing dishes. => 5301,235,216,147,8891,246,9784,202,2888,13720,147,28880,147,23852,383,148 +The baby is crawling on the floor. => 505,147,23597,235,147,22120,248,347,170,147,5895,148 +I need to buy a new pair of shoes. => 286,1645,194,147,8068,216,277,12632,210,147,155,21953,155,148 +They are going on a road trip across the country. => 18815,429,6552,347,216,147,6362,147,11395,9762,170,11305,148 +She is playing the piano beautifully. => 10386,235,7084,248,170,147,207,23635,147,23447,391,148 +We are going to a concert tomorrow night. => 3250,429,6552,194,216,1710,4391,29524,12716,148 +The cake tastes delicious with vanilla frosting. => 505,147,24422,147,20931,155,3936,23455,312,5535,7476,147,202,3973,3431,148 +He is a dedicated teacher who inspires his students. => 5301,235,216,326,8298,3460,147,9675,2888,147,28801,155,1439,15707,148 +The students are participating in a science fair. => 505,15707,429,147,30961,3643,219,216,147,10587,147,7636,148 +I enjoy hiking in the mountains. => 286,23162,147,2254,5357,219,170,147,204,18028,155,148 +They are organizing a beach cleanup next weekend. => 18815,429,147,16442,248,216,294,5371,147,10401,2399,9238,314,148 +She is taking photographs of nature. => 10386,235,147,12345,147,4709,1547,155,210,147,211,8603,148 +We should try a new restaurant in town. => 3250,936,147,746,216,277,147,11007,219,147,10200,148 +The traffic is moving slowly on the highway. => 505,147,11097,235,147,8601,147,9880,391,347,170,5976,3330,148 +He is a talented singer with a beautiful voice. => 5301,235,216,147,29750,246,147,155,248,279,312,216,147,23447,147,9316,148 +The baby is laughing and giggling. => 505,147,23597,235,147,23066,248,221,147,2341,3631,2869,148 +I need to do laundry and wash my clothes. => 286,1645,194,543,960,3981,2154,221,147,27589,1247,147,22141,383,148 +They are planning a trip to Europe. => 18815,429,147,23661,216,147,11395,194,13131,148 +She is learning how to play the guitar. => 10386,235,11754,2496,194,7084,170,147,4604,2214,148 +We are going to a museum this Sunday. => 3250,429,6552,194,216,147,204,433,1177,291,147,29111,148 +The coffee smells amazing in the morning. => 505,147,21526,31454,155,147,28880,219,170,20701,148 +He is a hardworking farmer who grows crops. => 5301,235,216,8524,14992,147,16679,279,2888,147,6044,155,147,8650,155,148 +The students are presenting their research projects. => 505,15707,429,5130,248,1669,13217,14235,148 +I enjoy playing soccer with my friends. 
=> 286,23162,7084,248,147,9351,5318,312,1247,147,5347,155,148 +They are volunteering at a local shelter. => 18815,429,147,5238,7478,163,12798,536,216,2491,2905,1359,279,148 +She is practicing martial arts for self-defense. => 10386,235,147,18453,248,147,3261,185,4381,12234,155,250,623,153,29896,148 +We should try a new recipe for dinner. => 3250,936,147,746,216,277,147,9851,250,147,182,2749,148 +The traffic is congest => 505,147,11097,235,1710,14169 +The sun is shining brightly today. => 505,147,5852,235,147,7304,2967,147,215,649,391,13386,148 +I enjoy reading books in my free time. => 286,23162,9838,147,9670,219,1247,4002,801,148 +She plays the piano beautifully. => 10386,7084,155,170,147,207,23635,147,23447,391,148 +The cat chased the mouse around the room. => 505,147,1604,147,196,916,246,170,12551,6890,170,9654,148 +I love eating pizza with extra cheese. => 286,8440,147,163,3643,147,207,8403,312,8230,9784,383,163,148 +He always wears a hat wherever he goes. => 5301,5418,147,16427,155,216,147,4879,2171,2433,1189,16177,148 +The flowers in the garden are blooming. => 505,147,22953,155,219,170,147,22140,429,147,10411,2799,248,148 +She danced gracefully on the stage. => 10386,13378,12408,147,267,8771,8001,347,170,12685,148 +The dog barked loudly in the park. => 505,147,6540,147,973,293,246,147,30182,391,219,170,147,17664,148 +We went swimming in the ocean yesterday. => 3250,10825,147,4729,8528,248,219,170,147,26193,147,28830,148 +He speaks fluent French and Spanish. => 5301,147,13285,155,147,21677,147,254,17590,221,147,31519,148 +The train arrived at the station on time. => 505,147,872,147,20712,182,536,170,147,7184,347,801,148 +She cooked a delicious meal for her family. => 10386,147,20453,246,216,3936,23455,147,26658,250,1869,147,2002,148 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/starcoder.txt b/stable-diffusion.cpp/ggml/examples/prompts/starcoder.txt new file mode 100644 index 0000000000000000000000000000000000000000..03a5b22151692f3edb496de0ce45bc12b13d9d9f --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/starcoder.txt @@ -0,0 +1,100 @@ +Hello World! => 8279,10896,19 +I can't believe it's already Friday!" => 59,883,1330,13710,561,1182,3425,506,25674,11555 +The URL for the website is https://www.example.com." => 1318,3834,436,322,9575,438,1678,555,1499,32,2763,32,508,3107 +"She said, 'I love to travel.'" => 20,25387,9884,30,330,59,14290,372,25283,29329 +'The temperature is 25.5°C.' => 25,1318,13587,438,225,36,39,32,39,23767,53,4564 +"Let's meet at 2:30 p.m. in the park." => 20,9809,1182,18450,821,225,36,44,37,34,298,32,95,32,328,322,880,93,3107 +The book costs $19.99 => 1318,7618,25950,398,35,43,32,43,43 +"John's favorite color is blue." => 20,19693,1182,27448,1963,438,10087,3107 +Th@nk y0u f0r y0ur h3lp! => 1027,50,19877,533,34,103,296,34,100,533,34,305,420,37,1915,19 +C@n I g3t a c0ffee, pl3@se? => 53,50,96,439,485,37,102,312,281,34,21298,30,1278,37,50,277,49 +W0w! Th@t's @m@zing! => 73,34,105,19,947,50,102,1182,477,95,50,26768,19 +H0w 4re y0u t0d@y? => 58,34,105,225,38,268,533,34,103,273,34,86,50,107,49 +I l0ve t0 tr@vel @r0und the w0rld. => 59,456,34,587,273,34,554,50,1203,477,100,34,642,322,341,34,100,1381,32 +Wh@t's y0ur f@v0rite m0vie? => 2444,50,102,1182,533,34,305,296,50,104,34,1049,345,34,104,1075,49 +The cat is sleeping on the mat. => 1318,10501,438,9368,299,544,322,2491,32 +I need to buy some groceries for dinner. => 59,1849,372,16968,1629,20234,85,6958,436,343,3369,32 +The sun is shining brightly in the sky. 
=> 1318,15323,438,787,19068,38231,631,328,322,26718,32 +She is reading a book in the park. => 25387,438,9175,312,7618,328,322,880,93,32 +We went for a walk on the beach yesterday. => 3122,14236,436,312,13503,544,322,526,867,39485,32 +He plays the guitar like a pro. => 1331,41271,322,3932,19931,2124,312,534,32 +They are going to the movies tonight. => 31805,884,6783,372,322,27889,26076,694,32 +The flowers are blooming in the garden. => 1318,7290,483,884,323,18466,299,328,322,485,22461,32 +I enjoy listening to classical music. => 59,31567,20498,372,443,1578,17522,32 +We need to buy groceries for the week. => 3122,1849,372,16968,20234,85,6958,436,322,8209,32 +The dog is chasing its tail in circles. => 1318,27435,438,663,9949,2819,13203,328,46428,32 +She is wearing a beautiful red dress. => 25387,438,996,6992,312,36493,3346,343,714,32 +He is a talented actor in Hollywood. => 1331,438,312,273,9556,318,16038,328,48228,631,21118,32 +The children are playing in the playground. => 1318,5713,884,19788,328,322,4654,1749,32 +I'm going to visit my grandparents this weekend. => 59,3464,6783,372,7725,1672,33162,19277,458,40618,32 +The coffee tastes bitter without sugar. => 1318,36917,273,633,307,3493,391,2876,309,18628,32 +They are planning a surprise party for her. => 31805,884,26116,312,6178,9251,15270,436,7791,32 +She sings like an angel on stage. => 25387,309,2052,2124,600,600,17691,544,10019,32 +We should take a vacation to relax. => 3122,1395,4818,312,29164,367,372,41972,32 +He is studying medicine at the university. => 1331,438,14866,299,32388,482,821,322,707,9190,32 +The rain is pouring heavily outside. => 1318,36987,438,9202,299,46003,2801,11127,32 +I enjoy watching romantic movies. => 59,31567,37652,26045,7268,27889,32 +They are celebrating their anniversary today. => 31805,884,48278,839,1741,3623,23921,5810,672,11610,32 +She dances gracefully to the music. => 25387,343,3151,31376,4938,372,322,17522,32 +He is an excellent basketball player. => 1331,438,600,39203,48400,11653,4362,32 +The baby is sleeping soundly in the crib. => 1318,323,17156,438,9368,299,9934,631,328,322,281,7972,32 +I need to finish my homework before dinner. => 59,1849,372,11361,1672,6765,1007,2670,343,3369,32 +They are organizing a charity event next month. => 31805,884,10558,6183,312,1351,543,1692,2354,6811,32 +She is cooking a delicious meal for us. => 25387,438,23682,299,312,409,406,2406,597,279,436,1770,32 +We should go hiking in the mountains. => 3122,1395,1983,420,1546,299,328,322,10874,1907,32 +The car broke down on the way to work. => 1318,6346,43289,2835,544,322,3352,372,1389,32 +He loves playing video games in his free time. => 1331,598,4954,19788,6027,19705,328,6697,3741,1133,32 +The birds are chirping in the trees. => 1318,8424,3210,884,663,476,7075,328,322,23453,32 +I want to learn how to play the piano. => 59,2637,372,7350,2624,372,4654,322,298,25757,32 +They are building a new shopping mall in the city. => 31805,884,9038,312,537,40692,345,464,328,322,11297,32 +She is writing a novel in her spare time. => 25387,438,4127,312,32913,328,7791,1869,586,1133,32 +We are going to the zoo this Saturday. => 3122,884,6783,372,322,1288,604,458,358,30288,32 +The cake looks delicious with chocolate frosting. => 1318,281,1062,7780,409,406,2406,623,10408,27589,296,20932,299,32 +He is a talented painter who sells his artwork. => 1331,438,312,273,9556,318,42300,6560,10800,101,6697,5549,1007,32 +The students are studying for their exams. => 1318,16512,884,14866,299,436,3623,538,1462,32 +I enjoy swimming in the ocean. 
=> 59,31567,2535,449,6714,328,322,337,18857,32 +They are renovating their house. => 31805,884,316,15007,1741,3623,17075,32 +She is practicing yoga to stay healthy. => 25387,438,11808,11636,533,40067,372,20005,44538,32 +We should plant flowers in the garden. => 3122,1395,26795,7290,483,328,322,485,22461,32 +The traffic is heavy during rush hour. => 1318,16391,438,32389,5929,540,1372,12021,32 +He is a skilled chef who creates amazing dishes. => 1331,438,312,3001,12088,44051,6560,9585,36986,1214,4279,32 +The baby is crawling on the floor. => 1318,323,17156,438,281,1294,2920,544,322,17648,32 +I need to buy a new pair of shoes. => 59,1849,372,16968,312,537,6092,432,787,37764,32 +They are going on a road trip across the country. => 31805,884,6783,544,312,24122,19337,10160,322,10769,32 +She is playing the piano beautifully. => 25387,438,19788,322,298,25757,526,4846,325,514,107,32 +We are going to a concert tomorrow night. => 3122,884,6783,372,312,457,6989,31841,19212,32 +The cake tastes delicious with vanilla frosting. => 1318,281,1062,273,633,307,409,406,2406,623,44653,296,20932,299,32 +He is a dedicated teacher who inspires his students. => 1331,438,312,23112,30877,6560,26194,8017,6697,16512,32 +The students are participating in a science fair. => 1318,16512,884,24623,1741,328,312,27536,19375,32 +I enjoy hiking in the mountains. => 59,31567,420,1546,299,328,322,10874,1907,32 +They are organizing a beach cleanup next weekend. => 31805,884,10558,6183,312,526,867,13144,2354,40618,32 +She is taking photographs of nature. => 25387,438,15137,15110,23626,432,24406,32 +We should try a new restaurant in town. => 3122,1395,1596,312,537,43719,328,38212,32 +The traffic is moving slowly on the highway. => 1318,16391,438,14089,12899,631,544,322,3857,3073,32 +He is a talented singer with a beautiful voice. => 1331,438,312,273,9556,318,309,10118,623,312,36493,20309,32 +The baby is laughing and giggling. => 1318,323,17156,438,2317,2943,299,461,485,365,36088,32 +I need to do laundry and wash my clothes. => 59,1849,372,745,2317,642,994,461,341,917,1672,7375,46948,32 +They are planning a trip to Europe. => 31805,884,26116,312,19337,372,27268,32 +She is learning how to play the guitar. => 25387,438,9608,2624,372,4654,322,3932,19931,32 +We are going to a museum this Sunday. => 3122,884,6783,372,312,345,539,378,458,358,28036,32 +The coffee smells amazing in the morning. => 1318,36917,309,42153,101,36986,328,322,33768,32 +He is a hardworking farmer who grows crops. => 1331,438,312,6784,13578,9019,2302,6560,485,2138,25170,1069,32 +The students are presenting their research projects. => 1318,16512,884,5024,299,3623,13234,8528,32 +I enjoy playing soccer with my friends. => 59,31567,19788,22682,10035,623,1672,22523,32 +They are volunteering at a local shelter. => 31805,884,3920,45585,8637,821,312,2196,309,2542,391,32 +She is practicing martial arts for self-defense. => 25387,438,11808,11636,345,502,564,5549,101,436,630,31,43694,32 +We should try a new recipe for dinner. => 3122,1395,1596,312,537,15233,436,343,3369,32 +The traffic is congest => 1318,16391,438,457,2776 +The sun is shining brightly today. => 1318,15323,438,787,19068,38231,631,11610,32 +I enjoy reading books in my free time. => 59,31567,9175,21739,328,1672,3741,1133,32 +She plays the piano beautifully. => 25387,41271,322,298,25757,526,4846,325,514,107,32 +The cat chased the mouse around the room. => 1318,10501,663,16109,322,8459,6835,322,8355,32 +I love eating pizza with extra cheese. 
=> 59,14290,484,1741,47630,623,6717,8277,30315,32 +He always wears a hat wherever he goes. => 1331,5182,996,4177,312,25793,2154,424,938,13107,32 +The flowers in the garden are blooming. => 1318,7290,483,328,322,485,22461,884,323,18466,299,32 +She danced gracefully on the stage. => 25387,343,6087,31376,4938,544,322,10019,32 +The dog barked loudly in the park. => 1318,27435,323,1087,318,598,836,631,328,322,880,93,32 +We went swimming in the ocean yesterday. => 3122,14236,2535,449,6714,328,322,337,18857,39485,32 +He speaks fluent French and Spanish. => 1331,24498,101,38055,43652,461,14911,1708,32 +The train arrived at the station on time. => 1318,5683,2099,32114,821,322,18662,544,1133,32 +She cooked a delicious meal for her family. => 25387,23682,318,312,409,406,2406,597,279,436,7791,13872,32 diff --git a/stable-diffusion.cpp/ggml/examples/prompts/test-cases.txt b/stable-diffusion.cpp/ggml/examples/prompts/test-cases.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d0bdbf9937b7494e3984666a1da723c63ed3132 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/test-cases.txt @@ -0,0 +1,110 @@ +# test case format +# <language>: <sentence> + +English: Hello World! +English: I can't believe it's already Friday!" +English: The URL for the website is https://www.example.com." +English: "She said, 'I love to travel.'" +English: 'The temperature is 25.5°C.' +English: "Let's meet at 2:30 p.m. in the park." +English: The book costs $19.99 +English: "John's favorite color is blue." +English: Th@nk y0u f0r y0ur h3lp! +English: C@n I g3t a c0ffee, pl3@se? +English: W0w! Th@t's @m@zing! +English: H0w 4re y0u t0d@y? +English: I l0ve t0 tr@vel @r0und the w0rld. +English: Wh@t's y0ur f@v0rite m0vie? +English: The cat is sleeping on the mat. +English: I need to buy some groceries for dinner. +English: The sun is shining brightly in the sky. +English: She is reading a book in the park. +English: We went for a walk on the beach yesterday. +English: He plays the guitar like a pro. +English: They are going to the movies tonight. +English: The flowers are blooming in the garden. +English: I enjoy listening to classical music. +English: We need to buy groceries for the week. +English: The dog is chasing its tail in circles. +English: She is wearing a beautiful red dress. +English: He is a talented actor in Hollywood. +English: The children are playing in the playground. +English: I'm going to visit my grandparents this weekend. +English: The coffee tastes bitter without sugar. +English: They are planning a surprise party for her. +English: She sings like an angel on stage. +English: We should take a vacation to relax. +English: He is studying medicine at the university. +English: The rain is pouring heavily outside. +English: I enjoy watching romantic movies. +English: They are celebrating their anniversary today. +English: She dances gracefully to the music. +English: He is an excellent basketball player. +English: The baby is sleeping soundly in the crib. +English: I need to finish my homework before dinner. +English: They are organizing a charity event next month. +English: She is cooking a delicious meal for us. +English: We should go hiking in the mountains. +English: The car broke down on the way to work. +English: He loves playing video games in his free time. +English: The birds are chirping in the trees. +English: I want to learn how to play the piano. +English: They are building a new shopping mall in the city. +English: She is writing a novel in her spare time. 
+English: We are going to the zoo this Saturday. +English: The cake looks delicious with chocolate frosting. +English: He is a talented painter who sells his artwork. +English: The students are studying for their exams. +English: I enjoy swimming in the ocean. +English: They are renovating their house. +English: She is practicing yoga to stay healthy. +English: We should plant flowers in the garden. +English: The traffic is heavy during rush hour. +English: He is a skilled chef who creates amazing dishes. +English: The baby is crawling on the floor. +English: I need to buy a new pair of shoes. +English: They are going on a road trip across the country. +English: She is playing the piano beautifully. +English: We are going to a concert tomorrow night. +English: The cake tastes delicious with vanilla frosting. +English: He is a dedicated teacher who inspires his students. +English: The students are participating in a science fair. +English: I enjoy hiking in the mountains. +English: They are organizing a beach cleanup next weekend. +English: She is taking photographs of nature. +English: We should try a new restaurant in town. +English: The traffic is moving slowly on the highway. +English: He is a talented singer with a beautiful voice. +English: The baby is laughing and giggling. +English: I need to do laundry and wash my clothes. +English: They are planning a trip to Europe. +English: She is learning how to play the guitar. +English: We are going to a museum this Sunday. +English: The coffee smells amazing in the morning. +English: He is a hardworking farmer who grows crops. +English: The students are presenting their research projects. +English: I enjoy playing soccer with my friends. +English: They are volunteering at a local shelter. +English: She is practicing martial arts for self-defense. +English: We should try a new recipe for dinner. +English: The traffic is congest +English: The sun is shining brightly today. +English: I enjoy reading books in my free time. +English: She plays the piano beautifully. +English: The cat chased the mouse around the room. +English: I love eating pizza with extra cheese. +English: He always wears a hat wherever he goes. +English: The flowers in the garden are blooming. +English: She danced gracefully on the stage. +English: The dog barked loudly in the park. +English: We went swimming in the ocean yesterday. +English: He speaks fluent French and Spanish. +English: The train arrived at the station on time. +English: She cooked a delicious meal for her family. +Korean: 이것은 테스트 이다. +Korean: 걱정할 필요 없다. +Korean: 버그는 언젠가 고쳐진다. +Japanese: 明日の天気はどうですか。 +Chinese: 请问洗手间在哪里? +Emoji: I'm feeling 😄 today! 
+Unicode: ◑ ▢ ▣ ◱ \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/examples/prompts/tokenize_huggingface.py b/stable-diffusion.cpp/ggml/examples/prompts/tokenize_huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..627771fbe9a39fc94bd5805b4dbbfd7ac1731874 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/tokenize_huggingface.py @@ -0,0 +1,65 @@ +import os +from transformers import AutoTokenizer + +os.environ['TOKENIZERS_PARALLELISM'] = "false" + +list_repo_hf = ["databricks/dolly-v2-3b", # dolly-v2 (3b, 7b, 12b models share the same tokenizer) + "gpt2", # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer) + "uer/gpt2-chinese-cluecorpussmall", # gpt-2-chinese + "EleutherAI/gpt-j-6b", # gpt-j + "EleutherAI/gpt-neox-20b", # gpt-neox + "EleutherAI/polyglot-ko-1.3b", # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer) + "rinna/japanese-gpt-neox-3.6b", # gpt-neox + # mpt-7b (uses gpt-neox-20b tokenizer) + "replit/replit-code-v1-3b", # replit + "bigcode/starcoder", # starcoder (huggingface-cli login required) + "openai/whisper-tiny" # whisper (base, large, large-v2 share the same tokenizer) + ] + +repo2ggml = {"databricks/dolly-v2-3b" : "dolly-v2", + "gpt2" : "gpt-2", + "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese", + "EleutherAI/gpt-j-6b" : "gpt-j", + "EleutherAI/gpt-neox-20b" : "gpt-neox", + "EleutherAI/polyglot-ko-1.3b" : "polyglot-ko", + "rinna/japanese-gpt-neox-3.6b" : "gpt-neox-japanese", + "replit/replit-code-v1-3b" : "replit", + "bigcode/starcoder" : "starcoder", + "openai/whisper-tiny" : "whisper"} + +repo2language = {"databricks/dolly-v2-3b" : "english", + "gpt2" : "english", + "uer/gpt2-chinese-cluecorpussmall" : "chinese", + "EleutherAI/gpt-j-6b" : "english", + "EleutherAI/gpt-neox-20b" : "english", + "EleutherAI/polyglot-ko-1.3b" : "korean", + "rinna/japanese-gpt-neox-3.6b" : "japanese", + "replit/replit-code-v1-3b" : "english", + "bigcode/starcoder" : "english", + "openai/whisper-tiny" : "english"} + +delimiter = ": " +test_sentences = [] +with open("test-cases.txt", "r") as f: + lines = [l.rstrip() for l in f.readlines()] + for l in lines: + if delimiter in l: + language = l[:l.index(delimiter)] + sentence = l[l.index(delimiter) + len(delimiter):] + test_sentences.append((language.lower(), sentence)) + +for repo in list_repo_hf: + + target_language = repo2language[repo] + + tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True) + + tokens_hf = [] + for language, sentence in test_sentences: + if language == target_language: + tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence)) + tokens_hf.append((sentence, tokens)) + + save_txt = repo2ggml[repo] + ".txt" + with open(save_txt, "w") as f: + f.writelines([sentence + " => " + ",".join(str(t) for t in tokens) + "\n" for sentence, tokens in tokens_hf]) diff --git a/stable-diffusion.cpp/ggml/examples/prompts/whisper.txt b/stable-diffusion.cpp/ggml/examples/prompts/whisper.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8f1caafb986ad4d6b2deb6ae6ac4b09f84c1e7f --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/prompts/whisper.txt @@ -0,0 +1,100 @@ +Hello World! => 15947,3937,0 +I can't believe it's already Friday!" => 40,393,380,1697,309,311,1217,6984,2963 +The URL for the website is https://www.example.com." 
=> 2278,12905,337,220,3322,3144,307,34426,21492,17919,13,3121,335,781,13,1112,889 +"She said, 'I love to travel.'" => 1,9526,848,11,922,40,959,220,1353,220,17227,779,28763 +'The temperature is 25.5°C.' => 6,2278,220,18275,610,1503,307,3552,13,20,11782,34,4443 +"Let's meet at 2:30 p.m. in the park." => 1,8373,311,1677,412,568,25,3446,280,13,76,13,294,220,3322,3884,889 +The book costs $19.99 => 2278,1446,5497,1848,3405,13,8494 +"John's favorite color is blue." => 1,16938,311,2954,2017,307,3344,889 +Th@nk y0u f0r y0ur h3lp! => 2434,31,77,74,288,15,84,283,15,81,288,15,374,276,18,75,79,0 +C@n I g3t a c0ffee, pl3@se? => 34,31,77,286,290,18,83,257,269,15,4617,11,499,18,31,405,30 +W0w! Th@t's @m@zing! => 54,15,86,0,334,31,83,311,10428,76,31,8781,0 +H0w 4re y0u t0d@y? => 39,15,86,1017,265,288,15,84,220,83,15,67,31,88,30 +I l0ve t0 tr@vel @r0und the w0rld. => 40,287,15,303,220,83,15,220,6903,31,779,10428,81,15,997,220,3322,261,15,81,348,13 +Wh@t's y0ur f@v0rite m0vie? => 2471,31,83,311,288,15,374,283,31,85,15,35002,275,15,12702,30 +The cat is sleeping on the mat. => 2278,3857,307,8296,322,220,3322,3803,13 +I need to buy some groceries for dinner. => 40,643,220,1353,2256,512,31391,337,6148,13 +The sun is shining brightly in the sky. => 2278,3295,307,18269,47418,294,220,3322,5443,13 +She is reading a book in the park. => 9526,307,3760,257,1446,294,220,3322,3884,13 +We went for a walk on the beach yesterday. => 4360,1437,337,257,1792,322,220,3322,7534,5186,13 +He plays the guitar like a pro. => 5205,5749,220,3322,7531,411,257,447,13 +They are going to the movies tonight. => 8829,366,516,220,1353,220,3322,6233,220,1756,397,13 +The flowers are blooming in the garden. => 2278,8085,366,45294,294,220,3322,7431,13 +I enjoy listening to classical music. => 40,2103,4764,220,1353,13735,1318,13 +We need to buy groceries for the week. => 4360,643,220,1353,2256,31391,337,220,3322,1243,13 +The dog is chasing its tail in circles. => 2278,3000,307,17876,1080,220,14430,294,13040,13 +She is wearing a beautiful red dress. => 9526,307,4769,257,2238,2182,5231,13 +He is a talented actor in Hollywood. => 5205,307,257,220,32831,6003,8747,294,11628,13 +The children are playing in the playground. => 2278,2227,366,2433,294,220,3322,24646,13 +I'm going to visit my grandparents this weekend. => 40,478,516,220,1353,3441,452,21876,220,11176,6711,13 +The coffee tastes bitter without sugar. => 2278,4982,220,83,40246,13871,1553,5076,13 +They are planning a surprise party for her. => 8829,366,5038,257,6365,3595,337,720,13 +She sings like an angel on stage. => 9526,23250,411,364,14250,322,3233,13 +We should take a vacation to relax. => 4360,820,220,27612,257,12830,220,1353,5789,13 +He is studying medicine at the university. => 5205,307,7601,7195,412,220,3322,5454,13 +The rain is pouring heavily outside. => 2278,4830,307,20450,10950,2380,13 +I enjoy watching romantic movies. => 40,2103,1976,13590,6233,13 +They are celebrating their anniversary today. => 8829,366,15252,220,3322,347,12962,220,83,378,320,13 +She dances gracefully to the music. => 9526,28322,10042,2277,220,1353,220,3322,1318,13 +He is an excellent basketball player. => 5205,307,364,7103,11767,4256,13 +The baby is sleeping soundly in the crib. => 2278,3186,307,8296,1626,356,294,220,3322,47163,13 +I need to finish my homework before dinner. => 40,643,220,1353,2413,452,14578,949,6148,13 +They are organizing a charity event next month. => 8829,366,17608,257,16863,2280,958,1618,13 +She is cooking a delicious meal for us. 
=> 9526,307,6361,257,4809,6791,337,505,13 +We should go hiking in the mountains. => 4360,820,352,23784,294,220,3322,10233,13 +The car broke down on the way to work. => 2278,1032,6902,760,322,220,3322,636,220,1353,589,13 +He loves playing video games in his free time. => 5205,6752,2433,960,2813,294,702,1737,220,3766,13 +The birds are chirping in the trees. => 2278,9009,366,36682,294,220,3322,220,3599,279,13 +I want to learn how to play the piano. => 40,528,220,1353,1466,577,220,1353,862,220,3322,9211,13 +They are building a new shopping mall in the city. => 8829,366,2390,257,777,8688,16026,294,220,3322,2307,13 +She is writing a novel in her spare time. => 9526,307,3579,257,7613,294,720,13798,220,3766,13 +We are going to the zoo this Saturday. => 4360,366,516,220,1353,220,3322,25347,220,11176,8803,13 +The cake looks delicious with chocolate frosting. => 2278,5908,1542,4809,365,6215,37048,13 +He is a talented painter who sells his artwork. => 5205,307,257,220,32831,6003,26619,567,20897,702,15829,13 +The students are studying for their exams. => 2278,1731,366,7601,337,220,3322,347,20514,13 +I enjoy swimming in the ocean. => 40,2103,11989,294,220,3322,7810,13 +They are renovating their house. => 8829,366,18845,990,220,3322,347,1782,13 +She is practicing yoga to stay healthy. => 9526,307,11350,15128,220,1353,1754,4627,13 +We should plant flowers in the garden. => 4360,820,3709,8085,294,220,3322,7431,13 +The traffic is heavy during rush hour. => 2278,220,17227,3341,307,4676,1830,9300,1773,13 +He is a skilled chef who creates amazing dishes. => 5205,307,257,19690,10530,567,7829,2243,10814,13 +The baby is crawling on the floor. => 2278,3186,307,32979,322,220,3322,4123,13 +I need to buy a new pair of shoes. => 40,643,220,1353,2256,257,777,6119,295,6654,13 +They are going on a road trip across the country. => 8829,366,516,322,257,3060,220,83,8400,2108,220,3322,1941,13 +She is playing the piano beautifully. => 9526,307,2433,220,3322,9211,16525,13 +We are going to a concert tomorrow night. => 4360,366,516,220,1353,257,8543,220,83,298,3162,1818,13 +The cake tastes delicious with vanilla frosting. => 2278,5908,220,83,40246,4809,365,17528,37048,13 +He is a dedicated teacher who inspires his students. => 5205,307,257,8374,220,975,4062,567,32566,702,1731,13 +The students are participating in a science fair. => 2278,1731,366,13950,294,257,3497,3143,13 +I enjoy hiking in the mountains. => 40,2103,23784,294,220,3322,10233,13 +They are organizing a beach cleanup next weekend. => 8829,366,17608,257,7534,40991,958,6711,13 +She is taking photographs of nature. => 9526,307,220,48625,17649,295,3687,13 +We should try a new restaurant in town. => 4360,820,220,83,627,257,777,6383,294,220,30401,13 +The traffic is moving slowly on the highway. => 2278,220,17227,3341,307,2684,5692,322,220,3322,17205,13 +He is a talented singer with a beautiful voice. => 5205,307,257,220,32831,6003,11564,365,257,2238,3177,13 +The baby is laughing and giggling. => 2278,3186,307,5059,293,290,24542,13 +I need to do laundry and wash my clothes. => 40,643,220,1353,360,19811,293,5675,452,5534,13 +They are planning a trip to Europe. => 8829,366,5038,257,220,83,8400,220,1353,3315,13 +She is learning how to play the guitar. => 9526,307,2539,577,220,1353,862,220,3322,7531,13 +We are going to a museum this Sunday. => 4360,366,516,220,1353,257,8441,220,11176,7776,13 +The coffee smells amazing in the morning. => 2278,4982,10036,2243,294,220,3322,2446,13 +He is a hardworking farmer who grows crops. 
=> 5205,307,257,1152,22475,17891,567,13156,16829,13 +The students are presenting their research projects. => 2278,1731,366,15578,220,3322,347,2132,4455,13 +I enjoy playing soccer with my friends. => 40,2103,2433,15469,365,452,1855,13 +They are volunteering at a local shelter. => 8829,366,33237,412,257,2654,13341,13 +She is practicing martial arts for self-defense. => 9526,307,11350,20755,8609,337,2698,12,49268,13 +We should try a new recipe for dinner. => 4360,820,220,83,627,257,777,6782,337,6148,13 +The traffic is congest => 2278,220,17227,3341,307,31871 +The sun is shining brightly today. => 2278,3295,307,18269,47418,220,83,378,320,13 +I enjoy reading books in my free time. => 40,2103,3760,3642,294,452,1737,220,3766,13 +She plays the piano beautifully. => 9526,5749,220,3322,9211,16525,13 +The cat chased the mouse around the room. => 2278,3857,33091,220,3322,9719,926,220,3322,1808,13 +I love eating pizza with extra cheese. => 40,959,3936,8298,365,2857,5399,13 +He always wears a hat wherever he goes. => 5205,1009,20877,257,2385,8660,415,1709,13 +The flowers in the garden are blooming. => 2278,8085,294,220,3322,7431,366,45294,13 +She danced gracefully on the stage. => 9526,32909,10042,2277,322,220,3322,3233,13 +The dog barked loudly in the park. => 2278,3000,16202,292,22958,294,220,3322,3884,13 +We went swimming in the ocean yesterday. => 4360,1437,11989,294,220,3322,7810,5186,13 +He speaks fluent French and Spanish. => 5205,10789,40799,5522,293,8058,13 +The train arrived at the station on time. => 2278,220,83,7146,6678,412,220,3322,5214,322,220,3766,13 +She cooked a delicious meal for her family. => 9526,9267,257,4809,6791,337,720,1605,13 diff --git a/stable-diffusion.cpp/ggml/examples/python/README.md b/stable-diffusion.cpp/ggml/examples/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..480920f74cb28abee40027d5a48c74df609b5df6 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/README.md @@ -0,0 +1,115 @@ +# Simple autogenerated Python bindings for ggml + +This folder contains: + +- Scripts to generate full Python bindings from ggml headers (+ stubs for autocompletion in IDEs) +- Some barebones utils (see [ggml/utils.py](./ggml/utils.py)): + - `ggml.utils.init` builds a context that's freed automatically when the pointer gets GC'd + - `ggml.utils.copy` **copies between same-shaped tensors (numpy or ggml), w/ automatic (de/re)quantization** + - `ggml.utils.numpy` returns a numpy view over a ggml tensor; if it's quantized, it returns a copy (requires `allow_copy=True`) +- Very basic examples (anyone wants to port [llama2.c](https://github.com/karpathy/llama2.c)?) + +Provided you set `GGML_LIBRARY=.../path/to/libggml_shared.so` (see instructions below), it's trivial to do some operations on quantized tensors: + +```python +# Make sure libllama.so is in your [DY]LD_LIBRARY_PATH, or set GGML_LIBRARY=.../libggml_shared.so + +from ggml import lib, ffi +from ggml.utils import init, copy, numpy +import numpy as np + +ctx = init(mem_size=12*1024*1024) +n = 256 +n_threads = 4 + +a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) +b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # Can't both be quantized +sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too! 
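+# ggml ops are lazy: ggml_add only records a graph node; values are materialized when the graph is computed below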
+ +gf = ffi.new('struct ggml_cgraph*') +lib.ggml_build_forward_expand(gf, sum) + +copy(np.array([i for i in range(n)], np.float32), a) +copy(np.array([i*100 for i in range(n)], np.float32), b) + +lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + +print(numpy(a, allow_copy=True)) +# 0. 1.0439453 2.0878906 3.131836 4.1757812 5.2197266 ... +print(numpy(b)) +# 0. 100. 200. 300. 400. 500. ... +print(numpy(sum, allow_copy=True)) +# 0. 105.4375 210.875 316.3125 421.75 527.1875 ... +``` + +### Prerequisites + +You'll need a shared library of ggml to use the bindings. + +#### Build libggml_shared.so or libllama.so + +As of this writing, the best option is to use [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)'s generated `libggml_shared.so` or `libllama.so`, which you can build as follows: + +```bash +git clone https://github.com/ggerganov/llama.cpp +# On a CUDA-enabled system add -DLLAMA_CUBLAS=1 +# On a Mac add -DLLAMA_METAL=1 +cmake llama.cpp \ + -B llama_build \ + -DCMAKE_C_FLAGS=-Ofast \ + -DLLAMA_NATIVE=1 \ + -DLLAMA_LTO=1 \ + -DBUILD_SHARED_LIBS=1 \ + -DLLAMA_MPI=1 \ + -DLLAMA_BUILD_TESTS=0 \ + -DLLAMA_BUILD_EXAMPLES=0 +( cd llama_build && make -j ) + +# On Mac, this will be libggml_shared.dylib instead +export GGML_LIBRARY=$PWD/llama_build/libggml_shared.so +# Alternatively, you can just copy it to your system's lib dir, e.g. /usr/local/lib +``` + +#### (Optional) Regenerate the bindings and stubs + +If you added or changed any signatures of the C API, you'll want to regenerate the bindings ([ggml/cffi.py](./ggml/cffi.py)) and stubs ([ggml/__init__.pyi](./ggml/__init__.pyi)). + +Luckily it's a one-liner using [regenerate.py](./regenerate.py): + +```bash +pip install -q cffi + +python regenerate.py +``` + +By default it assumes `llama.cpp` was cloned in ../../../llama.cpp (alongside the ggml folder). You can override this with: + +```bash +C_INCLUDE_DIR=$LLAMA_CPP_DIR python regenerate.py +``` + +You can also edit [api.h](./api.h) to control which files should be included in the generated bindings (defaults to `llama.cpp/ggml*.h`). + +In fact, if you wanted to only generate bindings for the current version of the `ggml` repo itself (instead of `llama.cpp`; you'd lose support for k-quants), you could run: + +```bash +API=../../include/ggml/ggml.h python regenerate.py +``` + +## Develop + +Run tests: + +```bash +pytest +``` + +### Alternatives + +This example's goal is to showcase [cffi](https://cffi.readthedocs.io/)-generated bindings that are trivial to use and update, but there are already alternatives in the wild: + +- https://github.com/abetlen/ggml-python: these bindings seem to be hand-written and use [ctypes](https://docs.python.org/3/library/ctypes.html). It has [high-quality API reference docs](https://ggml-python.readthedocs.io/en/latest/api-reference/#ggml.ggml) that can be used with these bindings too, but it doesn't expose Metal, CUDA, MPI or OpenCL calls, doesn't support transparent (de/re)quantization like this example does (see [ggml.utils](./ggml/utils.py) module), and won't pick up your local changes.
+ +- https://github.com/abetlen/llama-cpp-python: these expose the C++ `llama.cpp` interface, which this example cannot easily be extended to support (`cffi` only generates bindings for C libraries). + +- [pybind11](https://github.com/pybind/pybind11) and [nanobind](https://github.com/wjakob/nanobind) are two alternatives to cffi that support binding C++ libraries, but neither of them seems to have an automatic generator (writing bindings is rather time-consuming). diff --git a/stable-diffusion.cpp/ggml/examples/python/api.h b/stable-diffusion.cpp/ggml/examples/python/api.h new file mode 100644 index 0000000000000000000000000000000000000000..8d565bd562e061ad4ffdc5cdfeb5dc99b1237490 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/api.h @@ -0,0 +1,14 @@ +/* + List here all the headers you want to expose in the Python bindings, + then run `python regenerate.py` (see details in README.md) +*/ + +#include "ggml.h" +#include "ggml-metal.h" +#include "ggml-opencl.h" + +// Headers below are currently only present in the llama.cpp repository, comment them out if you don't have them. +#include "k_quants.h" +#include "ggml-alloc.h" +#include "ggml-cuda.h" +#include "ggml-mpi.h" \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/examples/python/example_add_quant.py b/stable-diffusion.cpp/ggml/examples/python/example_add_quant.py new file mode 100644 index 0000000000000000000000000000000000000000..cecb44eccd2c9155a1a28f34fb0878837e753ce1 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/example_add_quant.py @@ -0,0 +1,25 @@ +from ggml import lib, ffi +from ggml.utils import init, copy, numpy +import numpy as np + +ctx = init(mem_size=12*1024*1024) # automatically freed when pointer is GC'd +n = 256 +n_threads = 4 + +a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) +b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # can't both be quantized +sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too! + +# See cffi's doc on how to allocate native memory: it's very simple!
+# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface +gf = ffi.new('struct ggml_cgraph*') +lib.ggml_build_forward_expand(gf, sum) + +copy(np.array([i for i in range(n)], np.float32), a) +copy(np.array([i*100 for i in range(n)], np.float32), b) + +lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + +print(numpy(a, allow_copy=True)) +print(numpy(b)) +print(numpy(sum, allow_copy=True)) \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/examples/python/example_test_all_quants.py b/stable-diffusion.cpp/ggml/examples/python/example_test_all_quants.py new file mode 100644 index 0000000000000000000000000000000000000000..8d3c96657f38d19003ceac87115badb5db3e5874 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/example_test_all_quants.py @@ -0,0 +1,68 @@ +from ggml import ffi, lib +from ggml.utils import init, numpy, copy +import numpy as np +from math import pi, cos, sin, ceil + +import matplotlib.pyplot as plt + +ctx = init(mem_size=100*1024*1024) # Will be auto-GC'd +n = 256 + +orig = np.array([ + [ + cos(j * 2 * pi / n) * (sin(i * 2 * pi / n)) + for j in range(n) + ] + for i in range(n) +], np.float32) +orig_tensor = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, n, n) +copy(orig, orig_tensor) + +quants = [ + type for type in range(lib.GGML_TYPE_COUNT) + if lib.ggml_is_quantized(type) and + type not in [lib.GGML_TYPE_Q8_1, lib.GGML_TYPE_Q8_K] # Apparently not supported +] +# quants = [lib.GGML_TYPE_Q2_K] # Test a single one + +def get_name(type): + name = lib.ggml_type_name(type) + return ffi.string(name).decode('utf-8') if name else '?' + +quants.sort(key=get_name) +quants.insert(0, None) +print(quants) + +ncols=4 +nrows = ceil(len(quants) / ncols) + +plt.figure(figsize=(ncols * 5, nrows * 5), layout='tight') + +for i, type in enumerate(quants): + plt.subplot(nrows, ncols, i + 1) + try: + if type == None: + plt.title('Original') + plt.imshow(orig) + else: + quantized_tensor = lib.ggml_new_tensor_2d(ctx, type, n, n) + copy(orig_tensor, quantized_tensor) + quantized = numpy(quantized_tensor, allow_copy=True) + d = quantized - orig + results = { + "l2": np.linalg.norm(d, 2), + "linf": np.linalg.norm(d, np.inf), + "compression": + round(lib.ggml_nbytes(orig_tensor) / + lib.ggml_nbytes(quantized_tensor), 1) + } + name = get_name(type) + print(f'{name}: {results}') + + plt.title(f'{name} ({results["compression"]}x smaller)') + plt.imshow(quantized, interpolation='nearest') + + except Exception as e: + print(f'Error: {e}') + +plt.show() \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.py b/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..31a19102fc400fac5f2f513202e78990f2492732 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.py @@ -0,0 +1,58 @@ +""" + Python bindings for the ggml library. 
+ + Usage example: + + from ggml import lib, ffi + from ggml.utils import init, copy, numpy + import numpy as np + + ctx = init(mem_size=10*1024*1024) + n = 1024 + n_threads = 4 + + a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) + b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) + sum = lib.ggml_add(ctx, a, b) + + gf = ffi.new('struct ggml_cgraph*') + lib.ggml_build_forward_expand(gf, sum) + + copy(np.array([i for i in range(n)], np.float32), a) + copy(np.array([i*100 for i in range(n)], np.float32), b) + lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + + print(numpy(sum, allow_copy=True)) + + See https://cffi.readthedocs.io/en/latest/cdef.html for more on cffi. +""" + +try: + from ggml.cffi import ffi as ffi +except ImportError as e: + raise ImportError(f"Couldn't find ggml bindings ({e}). Run `python regenerate.py` or check your PYTHONPATH.") + +import os, platform + +__exact_library = os.environ.get("GGML_LIBRARY") +if __exact_library: + __candidates = [__exact_library] +elif platform.system() == "Windows": + __candidates = ["ggml_shared.dll", "llama.dll"] +else: + __candidates = ["libggml_shared.so", "libllama.so"] + if platform.system() == "Darwin": + __candidates += ["libggml_shared.dylib", "libllama.dylib"] + +for i, name in enumerate(__candidates): + try: + # This is where all the functions, enums and constants are defined + lib = ffi.dlopen(name) + except OSError: + if i < len(__candidates) - 1: + continue + raise OSError(f"Couldn't find ggml's shared library (tried names: {__candidates}). Add its directory to DYLD_LIBRARY_PATH (on Mac) or LD_LIBRARY_PATH, or define GGML_LIBRARY.") + +# This contains the cffi helpers such as new, cast, string, etc. +# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface +ffi = ffi diff --git a/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.pyi b/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..1a764b0b13ba41366e17a3fb8b2a2959943b20de --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/ggml/__init__.pyi @@ -0,0 +1,2431 @@ +# auto-generated file +import ggml.ffi as ffi +import numpy as np +class lib: + @property + def GGML_BACKEND_CPU(self) -> int: ... + @property + def GGML_BACKEND_GPU(self) -> int: ... + @property + def GGML_BACKEND_GPU_SPLIT(self) -> int: ... + @property + def GGML_FTYPE_ALL_F32(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_F16(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q2_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q3_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_0(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_1(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_1_SOME_F16(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_0(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_1(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q6_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q8_0(self) -> int: ... + @property + def GGML_FTYPE_UNKNOWN(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_ARMIJO(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_WOLFE(self) -> int: ... + @property + def GGML_LINESEARCH_DEFAULT(self) -> int: ... + @property + def GGML_LINESEARCH_FAIL(self) -> int: ... 
+ @property + def GGML_LINESEARCH_INVALID_PARAMETERS(self) -> int: ... + @property + def GGML_LINESEARCH_MAXIMUM_ITERATIONS(self) -> int: ... + @property + def GGML_LINESEARCH_MAXIMUM_STEP(self) -> int: ... + @property + def GGML_LINESEARCH_MINIMUM_STEP(self) -> int: ... + @property + def GGML_OBJECT_GRAPH(self) -> int: ... + @property + def GGML_OBJECT_TENSOR(self) -> int: ... + @property + def GGML_OBJECT_WORK_BUFFER(self) -> int: ... + @property + def GGML_OPT_ADAM(self) -> int: ... + @property + def GGML_OPT_DID_NOT_CONVERGE(self) -> int: ... + @property + def GGML_OPT_FAIL(self) -> int: ... + @property + def GGML_OPT_INVALID_WOLFE(self) -> int: ... + @property + def GGML_OPT_LBFGS(self) -> int: ... + @property + def GGML_OPT_NO_CONTEXT(self) -> int: ... + @property + def GGML_OPT_OK(self) -> int: ... + @property + def GGML_OP_ACC(self) -> int: ... + @property + def GGML_OP_ADD(self) -> int: ... + @property + def GGML_OP_ADD1(self) -> int: ... + @property + def GGML_OP_ALIBI(self) -> int: ... + @property + def GGML_OP_ARGMAX(self) -> int: ... + @property + def GGML_OP_CLAMP(self) -> int: ... + @property + def GGML_OP_CONT(self) -> int: ... + @property + def GGML_OP_CONV_1D(self) -> int: ... + @property + def GGML_OP_CONV_2D(self) -> int: ... + @property + def GGML_OP_COUNT(self) -> int: ... + @property + def GGML_OP_CPY(self) -> int: ... + @property + def GGML_OP_CROSS_ENTROPY_LOSS(self) -> int: ... + @property + def GGML_OP_CROSS_ENTROPY_LOSS_BACK(self) -> int: ... + @property + def GGML_OP_DIAG(self) -> int: ... + @property + def GGML_OP_DIAG_MASK_INF(self) -> int: ... + @property + def GGML_OP_DIAG_MASK_ZERO(self) -> int: ... + @property + def GGML_OP_DIV(self) -> int: ... + @property + def GGML_OP_DUP(self) -> int: ... + @property + def GGML_OP_FLASH_ATTN(self) -> int: ... + @property + def GGML_OP_FLASH_ATTN_BACK(self) -> int: ... + @property + def GGML_OP_FLASH_FF(self) -> int: ... + @property + def GGML_OP_GET_ROWS(self) -> int: ... + @property + def GGML_OP_GET_ROWS_BACK(self) -> int: ... + @property + def GGML_OP_LOG(self) -> int: ... + @property + def GGML_OP_MAP_BINARY(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM1(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM1_F32(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM2(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM2_F32(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM3(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM3_F32(self) -> int: ... + @property + def GGML_OP_MAP_UNARY(self) -> int: ... + @property + def GGML_OP_MEAN(self) -> int: ... + @property + def GGML_OP_MUL(self) -> int: ... + @property + def GGML_OP_MUL_MAT(self) -> int: ... + @property + def GGML_OP_NONE(self) -> int: ... + @property + def GGML_OP_NORM(self) -> int: ... + @property + def GGML_OP_OUT_PROD(self) -> int: ... + @property + def GGML_OP_PERMUTE(self) -> int: ... + @property + def GGML_OP_POOL_1D(self) -> int: ... + @property + def GGML_OP_POOL_2D(self) -> int: ... + @property + def GGML_OP_POOL_AVG(self) -> int: ... + @property + def GGML_OP_POOL_COUNT(self) -> int: ... + @property + def GGML_OP_POOL_MAX(self) -> int: ... + @property + def GGML_OP_REPEAT(self) -> int: ... + @property + def GGML_OP_REPEAT_BACK(self) -> int: ... + @property + def GGML_OP_RESHAPE(self) -> int: ... + @property + def GGML_OP_RMS_NORM(self) -> int: ... + @property + def GGML_OP_RMS_NORM_BACK(self) -> int: ... + @property + def GGML_OP_ROPE(self) -> int: ... + @property + def GGML_OP_ROPE_BACK(self) -> int: ... 
+ @property + def GGML_OP_SCALE(self) -> int: ... + @property + def GGML_OP_SET(self) -> int: ... + @property + def GGML_OP_SILU_BACK(self) -> int: ... + @property + def GGML_OP_SOFT_MAX(self) -> int: ... + @property + def GGML_OP_SOFT_MAX_BACK(self) -> int: ... + @property + def GGML_OP_SQR(self) -> int: ... + @property + def GGML_OP_SQRT(self) -> int: ... + @property + def GGML_OP_SUB(self) -> int: ... + @property + def GGML_OP_SUM(self) -> int: ... + @property + def GGML_OP_SUM_ROWS(self) -> int: ... + @property + def GGML_OP_TRANSPOSE(self) -> int: ... + @property + def GGML_OP_UNARY(self) -> int: ... + @property + def GGML_OP_VIEW(self) -> int: ... + @property + def GGML_OP_WIN_PART(self) -> int: ... + @property + def GGML_OP_WIN_UNPART(self) -> int: ... + @property + def GGML_TASK_COMPUTE(self) -> int: ... + @property + def GGML_TASK_FINALIZE(self) -> int: ... + @property + def GGML_TASK_INIT(self) -> int: ... + @property + def GGML_TYPE_COUNT(self) -> int: ... + @property + def GGML_TYPE_F16(self) -> int: ... + @property + def GGML_TYPE_F32(self) -> int: ... + @property + def GGML_TYPE_I16(self) -> int: ... + @property + def GGML_TYPE_I32(self) -> int: ... + @property + def GGML_TYPE_I8(self) -> int: ... + @property + def GGML_TYPE_Q2_K(self) -> int: ... + @property + def GGML_TYPE_Q3_K(self) -> int: ... + @property + def GGML_TYPE_Q4_0(self) -> int: ... + @property + def GGML_TYPE_Q4_1(self) -> int: ... + @property + def GGML_TYPE_Q4_K(self) -> int: ... + @property + def GGML_TYPE_Q5_0(self) -> int: ... + @property + def GGML_TYPE_Q5_1(self) -> int: ... + @property + def GGML_TYPE_Q5_K(self) -> int: ... + @property + def GGML_TYPE_Q6_K(self) -> int: ... + @property + def GGML_TYPE_Q8_0(self) -> int: ... + @property + def GGML_TYPE_Q8_1(self) -> int: ... + @property + def GGML_TYPE_Q8_K(self) -> int: ... + @property + def GGML_UNARY_OP_ABS(self) -> int: ... + @property + def GGML_UNARY_OP_ELU(self) -> int: ... + @property + def GGML_UNARY_OP_GELU(self) -> int: ... + @property + def GGML_UNARY_OP_GELU_QUICK(self) -> int: ... + @property + def GGML_UNARY_OP_NEG(self) -> int: ... + @property + def GGML_UNARY_OP_RELU(self) -> int: ... + @property + def GGML_UNARY_OP_SGN(self) -> int: ... + @property + def GGML_UNARY_OP_SILU(self) -> int: ... + @property + def GGML_UNARY_OP_STEP(self) -> int: ... + @property + def GGML_UNARY_OP_TANH(self) -> int: ... + @property + def GGUF_TYPE_ARRAY(self) -> int: ... + @property + def GGUF_TYPE_BOOL(self) -> int: ... + @property + def GGUF_TYPE_COUNT(self) -> int: ... + @property + def GGUF_TYPE_FLOAT32(self) -> int: ... + @property + def GGUF_TYPE_INT16(self) -> int: ... + @property + def GGUF_TYPE_INT32(self) -> int: ... + @property + def GGUF_TYPE_INT8(self) -> int: ... + @property + def GGUF_TYPE_STRING(self) -> int: ... + @property + def GGUF_TYPE_UINT16(self) -> int: ... + @property + def GGUF_TYPE_UINT32(self) -> int: ... + @property + def GGUF_TYPE_UINT8(self) -> int: ... + def abort_callback(data: ffi.CData) -> bool: + """ + abort ggml_graph_compute when true + + bool (*abort_callback)(void * data); + """ + ... + def dequantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """ + Dequantization + + void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); + """ + ... + def dequantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);""" + ... 
+ def dequantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);""" + ... + def ggml_abs(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_abs_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_acc(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_acc_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_add(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add1(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add1_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_alibi(ctx: ffi.CData, a: ffi.CData, n_past: int, n_head: int, bias_max: float) -> ffi.CData: + """ + alibi position embedding + in-place, returns view(a) + + struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max); + """ + ... + def ggml_allocr_alloc(alloc: ffi.CData, tensor: ffi.CData) -> None: + """GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);""" + ... + def ggml_allocr_alloc_graph(alloc: ffi.CData, graph: ffi.CData) -> int: + """GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);""" + ... + def ggml_allocr_free(alloc: ffi.CData) -> None: + """GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);""" + ... + def ggml_allocr_is_measure(alloc: ffi.CData) -> bool: + """GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);""" + ... 
+ def ggml_allocr_new(data: ffi.CData, size: int, alignment: int) -> ffi.CData: + """GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);""" + ... + def ggml_allocr_new_measure(alignment: int) -> ffi.CData: + """GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);""" + ... + def ggml_allocr_reset(alloc: ffi.CData) -> None: + """GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);""" + ... + def ggml_allocr_set_parse_seq(alloc: ffi.CData, list: ffi.CData, n: int) -> None: + """ + tell the allocator to parse nodes following the order described in the list + you should call this if your graph are optimized to execute out-of-order + + GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); + """ + ... + def ggml_are_same_shape(t0: ffi.CData, t1: ffi.CData) -> bool: + """ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);""" + ... + def ggml_argmax(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + argmax along rows + + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_blck_size(type: int) -> int: + """ GGML_API int ggml_blck_size (enum ggml_type type);""" + ... + def ggml_build_backward(ctx: ffi.CData, gf: ffi.CData, keep: bool) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);""" + ... + def ggml_build_forward(tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);""" + ... + def ggml_build_forward_ctx(ctx: ffi.CData, tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);""" + ... + def ggml_build_forward_expand(cgraph: ffi.CData, tensor: ffi.CData) -> None: + """ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);""" + ... + def ggml_cl_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool: + """bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_free_data(tensor: ffi.CData) -> None: + """void ggml_cl_free_data(const struct ggml_tensor* tensor);""" + ... + def ggml_cl_host_free(ptr: ffi.CData) -> None: + """void ggml_cl_host_free(void * ptr);""" + ... + def ggml_cl_host_malloc(size: int) -> ffi.CData: + """void * ggml_cl_host_malloc(size_t size);""" + ... + def ggml_cl_init() -> None: + """void ggml_cl_init(void);""" + ... + def ggml_cl_mul(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> None: + """void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData, wdata: ffi.CData, wsize: int) -> None: + """void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);""" + ... + def ggml_cl_mul_mat_get_wsize(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> int: + """size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None: + """void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);""" + ... 
+ def ggml_clamp(ctx: ffi.CData, a: ffi.CData, min: float, max: float) -> ffi.CData: + """ + clamp + in-place, returns view(a) + + struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + """ + ... + def ggml_cont(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + make contiguous + + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + make contiguous, in-place + + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + """ + ... + def ggml_conv_1d_ph(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s: int, d: int) -> ffi.CData: + """ + conv_1d with padding = half + alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + + GGML_API struct ggml_tensor * ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + """ + ... + def ggml_conv_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, s1: int, p0: int, p1: int, d0: int, d1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + """ + ... + def ggml_cpu_has_arm_fma() -> int: + """ GGML_API int ggml_cpu_has_arm_fma (void);""" + ... + def ggml_cpu_has_avx() -> int: + """ GGML_API int ggml_cpu_has_avx (void);""" + ... + def ggml_cpu_has_avx2() -> int: + """ GGML_API int ggml_cpu_has_avx2 (void);""" + ... + def ggml_cpu_has_avx512() -> int: + """ GGML_API int ggml_cpu_has_avx512 (void);""" + ... + def ggml_cpu_has_avx512_vbmi() -> int: + """ GGML_API int ggml_cpu_has_avx512_vbmi(void);""" + ... + def ggml_cpu_has_avx512_vnni() -> int: + """ GGML_API int ggml_cpu_has_avx512_vnni(void);""" + ... + def ggml_cpu_has_blas() -> int: + """ GGML_API int ggml_cpu_has_blas (void);""" + ... + def ggml_cpu_has_clblast() -> int: + """ GGML_API int ggml_cpu_has_clblast (void);""" + ... + def ggml_cpu_has_cublas() -> int: + """ GGML_API int ggml_cpu_has_cublas (void);""" + ... + def ggml_cpu_has_f16c() -> int: + """ GGML_API int ggml_cpu_has_f16c (void);""" + ... + def ggml_cpu_has_fma() -> int: + """ GGML_API int ggml_cpu_has_fma (void);""" + ... + def ggml_cpu_has_fp16_va() -> int: + """ GGML_API int ggml_cpu_has_fp16_va (void);""" + ... + def ggml_cpu_has_gpublas() -> int: + """ GGML_API int ggml_cpu_has_gpublas (void);""" + ... + def ggml_cpu_has_neon() -> int: + """ GGML_API int ggml_cpu_has_neon (void);""" + ... + def ggml_cpu_has_sse3() -> int: + """ GGML_API int ggml_cpu_has_sse3 (void);""" + ... + def ggml_cpu_has_vsx() -> int: + """ GGML_API int ggml_cpu_has_vsx (void);""" + ... + def ggml_cpu_has_wasm_simd() -> int: + """ GGML_API int ggml_cpu_has_wasm_simd (void);""" + ... + def ggml_cpy(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a -> b, return view(b) + + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... 
+ def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a -> b, in-place, return view(b) + + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_cross_entropy_loss_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + """ + ... + def ggml_cuda_assign_buffers(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_assign_buffers_force_inplace(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_assign_buffers_no_scratch(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool: + """GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cuda_compute_forward(params: ffi.CData, tensor: ffi.CData) -> bool: + """GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);""" + ... + def ggml_cuda_free_data(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_free_scratch() -> None: + """GGML_API void ggml_cuda_free_scratch(void);""" + ... + def ggml_cuda_get_device_count() -> int: + """GGML_API int ggml_cuda_get_device_count(void);""" + ... + def ggml_cuda_get_device_description(device: int, description: ffi.CData, description_size: int) -> None: + """GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);""" + ... + def ggml_cuda_host_free(ptr: ffi.CData) -> None: + """GGML_API void ggml_cuda_host_free(void * ptr);""" + ... + def ggml_cuda_host_malloc(size: int) -> ffi.CData: + """GGML_API void * ggml_cuda_host_malloc(size_t size);""" + ... + def ggml_cuda_set_main_device(main_device: int) -> None: + """GGML_API void ggml_cuda_set_main_device(int main_device);""" + ... + def ggml_cuda_set_mul_mat_q(mul_mat_q: bool) -> None: + """GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);""" + ... + def ggml_cuda_set_scratch_size(scratch_size: int) -> None: + """GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);""" + ... + def ggml_cuda_set_tensor_split(tensor_split: ffi.CData) -> None: + """GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);""" + ... + def ggml_cuda_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);""" + ... + def ggml_cycles() -> int: + """ GGML_API int64_t ggml_cycles(void);""" + ... + def ggml_cycles_per_ms() -> int: + """ GGML_API int64_t ggml_cycles_per_ms(void);""" + ... 
+ def ggml_diag(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_diag_mask_inf(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + set elements above the diagonal to -INF + + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_diag_mask_inf_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_diag_mask_zero(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + set elements above the diagonal to 0 + + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_diag_mask_zero_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_div(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_div_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_dup(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_dup_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_dup_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);""" + ... + def ggml_element_size(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);""" + ... + def ggml_elu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_elu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_flash_attn(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, masked: bool) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked); + """ + ... + def ggml_flash_attn_back(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, d: ffi.CData, masked: bool) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + """ + ... 
+ def ggml_flash_ff(ctx: ffi.CData, a: ffi.CData, b0: ffi.CData, b1: ffi.CData, c0: ffi.CData, c1: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + """ + ... + def ggml_format_name(tensor: ffi.CData, fmt: ffi.CData, *args2) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);""" + ... + def ggml_fp16_to_fp32(x: np.float16) -> float: + """ + convert FP16 <-> FP32 + + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + """ + ... + def ggml_fp16_to_fp32_row(x: ffi.CData, y: ffi.CData, n: int) -> None: + """ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);""" + ... + def ggml_fp32_to_fp16(x: float) -> np.float16: + """ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);""" + ... + def ggml_fp32_to_fp16_row(x: ffi.CData, y: ffi.CData, n: int) -> None: + """ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);""" + ... + def ggml_free(ctx: ffi.CData) -> None: + """ GGML_API void ggml_free(struct ggml_context * ctx);""" + ... + def ggml_ftype_to_ggml_type(ftype: int) -> int: + """ + TODO: temporary until model loading of ggml examples is refactored + + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + """ + ... + def ggml_gelu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + TODO: double-check this computation is correct + + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_quick(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_quick_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_get_data(tensor: ffi.CData) -> ffi.CData: + """ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);""" + ... + def ggml_get_data_f32(tensor: ffi.CData) -> ffi.CData: + """ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);""" + ... + def ggml_get_f32_1d(tensor: ffi.CData, i: int) -> float: + """ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);""" + ... + def ggml_get_i32_1d(tensor: ffi.CData, i: int) -> int: + """ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);""" + ... + def ggml_get_max_tensor_size(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);""" + ... + def ggml_get_mem_buffer(ctx: ffi.CData) -> ffi.CData: + """ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);""" + ... + def ggml_get_mem_size(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);""" + ... + def ggml_get_name(tensor: ffi.CData) -> ffi.CData: + """ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);""" + ... + def ggml_get_no_alloc(ctx: ffi.CData) -> bool: + """ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);""" + ... 
+ def ggml_get_rows(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_get_rows_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + """ + ... + def ggml_get_tensor(ctx: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);""" + ... + def ggml_get_unary_op(tensor: ffi.CData) -> int: + """ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);""" + ... + def ggml_graph_compute(cgraph: ffi.CData, cplan: ffi.CData) -> int: + """ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);""" + ... + def ggml_graph_compute_with_ctx(ctx: ffi.CData, cgraph: ffi.CData, n_threads: int) -> None: + """ + same as ggml_graph_compute() but the work data is allocated as a part of the context + note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + """ + ... + def ggml_graph_dump_dot(gb: ffi.CData, gf: ffi.CData, filename: ffi.CData) -> None: + """ + dump the graph into a file using the dot format + + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + """ + ... + def ggml_graph_export(cgraph: ffi.CData, fname: ffi.CData) -> None: + """ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);""" + ... + def ggml_graph_get_tensor(cgraph: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);""" + ... + def ggml_graph_import(fname: ffi.CData, ctx_data: ffi.CData, ctx_eval: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);""" + ... + def ggml_graph_overhead() -> int: + """ GGML_API size_t ggml_graph_overhead(void);""" + ... + def ggml_graph_plan(cgraph: ffi.CData, n_threads: int) -> ffi.CData: + """ + ggml_graph_plan() has to be called before ggml_graph_compute() + when plan.work_size > 0, caller must allocate memory for plan.work_data + + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + """ + ... + def ggml_graph_print(cgraph: ffi.CData) -> None: + """ + print info and performance information for the graph + + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + """ + ... + def ggml_graph_reset(cgraph: ffi.CData) -> None: + """ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);""" + ... + def ggml_init(params: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);""" + ... + def ggml_init_cublas() -> None: + """GGML_API void ggml_init_cublas(void);""" + ... + def ggml_internal_get_type_traits(type: int) -> ffi.CData: + """ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);""" + ... 
+ def ggml_is_contiguous(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);""" + ... + def ggml_is_numa() -> bool: + """ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node""" + ... + def ggml_is_permuted(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);""" + ... + def ggml_is_quantized(type: int) -> bool: + """ GGML_API bool ggml_is_quantized(enum ggml_type type);""" + ... + def ggml_is_transposed(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);""" + ... + def ggml_log(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_log_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_map_binary_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); + """ + ... + def ggml_map_binary_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + """ + ... + def ggml_map_custom1(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom1_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + """ + ... + def ggml_map_custom1_inplace(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom1_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + """ + ... + def ggml_map_custom2(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + """ + ... 
+ def ggml_map_custom2_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + """ + ... + def ggml_map_custom2_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom2_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + """ + ... + def ggml_map_custom3(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom3_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + """ + ... + def ggml_map_custom3_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom3_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + """ + ... + def ggml_map_unary_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + """ + ... + def ggml_map_unary_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + """ + ... + def ggml_mean(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + mean along rows + + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... 
+ def ggml_metal_add_buffer(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int, max_size: int) -> bool: + """ + creates a mapping between a host memory buffer and a device memory buffer + - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute + - the mapping is used during computation to determine the arguments of the compute kernels + - you don't need to keep the host memory buffer allocated as it is never accessed by Metal + - max_size specifies the maximum size of a tensor and is used to create shared views such + that it is guaranteed that the tensor will fit in at least one of the views + + + bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size); + """ + ... + def ggml_metal_free(ctx: ffi.CData) -> None: + """void ggml_metal_free(struct ggml_metal_context * ctx);""" + ... + def ggml_metal_get_concur_list(ctx: ffi.CData) -> ffi.CData: + """ + output the concur_list for ggml_alloc + + int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); + """ + ... + def ggml_metal_get_tensor(ctx: ffi.CData, t: ffi.CData) -> None: + """ + get data from the device into host memory + + void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + """ + ... + def ggml_metal_graph_compute(ctx: ffi.CData, gf: ffi.CData) -> None: + """ + same as ggml_graph_compute but uses Metal + creates gf->n_threads command buffers in parallel + + void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); + """ + ... + def ggml_metal_graph_find_concurrency(ctx: ffi.CData, gf: ffi.CData, check_mem: bool) -> None: + """ + try to find operations that can be run concurrently in the graph + you should run it again if the topology of your graph changes + + void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); + """ + ... + def ggml_metal_host_free(data: ffi.CData) -> None: + """void ggml_metal_host_free (void * data);""" + ... + def ggml_metal_host_malloc(n: int) -> ffi.CData: + """void * ggml_metal_host_malloc(size_t n);""" + ... + def ggml_metal_if_optimized(ctx: ffi.CData) -> int: + """ + if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized + + int ggml_metal_if_optimized(struct ggml_metal_context * ctx); + """ + ... + def ggml_metal_init(n_cb: int) -> ffi.CData: + """ + number of command buffers to use + + struct ggml_metal_context * ggml_metal_init(int n_cb); + """ + ... + def ggml_metal_set_n_cb(ctx: ffi.CData, n_cb: int) -> None: + """ + set the number of command buffers to use + + void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + """ + ... + def ggml_metal_set_tensor(ctx: ffi.CData, t: ffi.CData) -> None: + """ + set data from host memory into the device + + void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + """ + ... + def ggml_mpi_backend_free() -> None: + """void ggml_mpi_backend_free(void);""" + ... + def ggml_mpi_backend_init() -> None: + """void ggml_mpi_backend_init(void);""" + ... + def ggml_mpi_eval_init(ctx_mpi: ffi.CData, n_tokens: ffi.CData, n_past: ffi.CData, n_threads: ffi.CData) -> None: + """ + void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads); + """ + ... + def ggml_mpi_free(ctx: ffi.CData) -> None: + """void ggml_mpi_free(struct ggml_mpi_context * ctx);""" + ... 
+ def ggml_mpi_graph_compute_post(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None: + """ + void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + """ + ... + def ggml_mpi_graph_compute_pre(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None: + """ + void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + """ + ... + def ggml_mpi_init() -> ffi.CData: + """struct ggml_mpi_context * ggml_mpi_init(void);""" + ... + def ggml_mpi_rank(ctx: ffi.CData) -> int: + """int ggml_mpi_rank(struct ggml_mpi_context * ctx);""" + ... + def ggml_mul(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_mul_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_mul_mat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + A: n columns, m rows + B: n columns, p rows (i.e. we transpose it internally) + result is m columns, p rows + + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_nbytes(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);""" + ... + def ggml_nbytes_pad(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN""" + ... + def ggml_nbytes_split(tensor: ffi.CData, nrows_split: int) -> int: + """ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);""" + ... + def ggml_neg(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_neg_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_nelements(tensor: ffi.CData) -> int: + """ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);""" + ... + def ggml_new_f32(ctx: ffi.CData, value: float) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);""" + ... + def ggml_new_graph(ctx: ffi.CData) -> ffi.CData: + """ + graph allocation in a context + + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); + """ + ... + def ggml_new_i32(ctx: ffi.CData, value: int) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);""" + ... + def ggml_new_tensor(ctx: ffi.CData, type: int, n_dims: int, ne: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + """ + ... + def ggml_new_tensor_1d(ctx: ffi.CData, type: int, ne0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + """ + ... 
+ def ggml_new_tensor_2d(ctx: ffi.CData, type: int, ne0: int, ne1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + """ + ... + def ggml_new_tensor_3d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + """ + ... + def ggml_new_tensor_4d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + """ + ... + def ggml_norm(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + normalize along rows + TODO: eps is hardcoded to 1e-5 for now + + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_norm_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_nrows(tensor: ffi.CData) -> int: + """ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);""" + ... + def ggml_numa_init() -> None: + """ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems""" + ... + def ggml_op_name(op: int) -> ffi.CData: + """ GGML_API const char * ggml_op_name (enum ggml_op op);""" + ... + def ggml_op_symbol(op: int) -> ffi.CData: + """ GGML_API const char * ggml_op_symbol(enum ggml_op op);""" + ... + def ggml_opt(ctx: ffi.CData, params: ffi.CData, f: ffi.CData) -> int: + """ + optimize the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + """ + ... + def ggml_opt_default_params(type: int) -> ffi.CData: + """ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);""" + ... + def ggml_opt_init(ctx: ffi.CData, opt: ffi.CData, params: ffi.CData, nx: int) -> None: + """ + initialize optimizer context + + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + """ + ... + def ggml_opt_resume(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData) -> int: + """ + continue optimizing the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + """ + ... + def ggml_opt_resume_g(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData, gf: ffi.CData, gb: ffi.CData) -> int: + """ + continue optimizing the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb); + """ + ... + def ggml_out_prod(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + A: m columns, n rows, + B: p columns, n rows, + result is m columns, p rows + + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... 
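The optimizer entry points above minimize a scalar tensor f whose inputs were registered with ggml_set_param. A hedged sketch, with ctx and f assumed to exist already:

opt_params = lib.ggml_opt_default_params(lib.GGML_OPT_ADAM)
result = lib.ggml_opt(ctx, opt_params, f)     # runs ADAM until it converges or gives up
assert result == lib.GGML_OPT_OK              # other outcomes include GGML_OPT_DID_NOT_CONVERGE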
+ def ggml_permute(ctx: ffi.CData, a: ffi.CData, axis0: int, axis1: int, axis2: int, axis3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + """ + ... + def ggml_pool_1d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, s0: int, p0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + """ + ... + def ggml_pool_2d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, k1: int, s0: int, s1: int, p0: int, p1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + """ + ... + def ggml_print_object(obj: ffi.CData) -> None: + """ GGML_API void ggml_print_object (const struct ggml_object * obj);""" + ... + def ggml_print_objects(ctx: ffi.CData) -> None: + """ GGML_API void ggml_print_objects(const struct ggml_context * ctx);""" + ... + def ggml_quantize_chunk(type: int, src: ffi.CData, dst: ffi.CData, start: int, n: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);""" + ... + def ggml_quantize_q2_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ + Quantization with histogram collection + + size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + """ + ... + def ggml_quantize_q3_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q6_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q8_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... 
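ggml_quantize_chunk quantizes n floats starting at element start and returns the number of bytes written; the int64_t array collects the histogram. A sketch (Q4_0 blocks hold 32 elements, so n should be a multiple of 32):

from ggml import lib, ffi

n = 1024
src = ffi.new('float[]', [0.01 * i for i in range(n)])
dst = ffi.new('char[]', n)                    # Q4_0 needs well under one byte per element
hist = ffi.new('int64_t[]', 16)               # zero-initialized histogram buckets
written = lib.ggml_quantize_chunk(lib.GGML_TYPE_Q4_0, src, dst, 0, n, hist)
print(n * 4, '->', written, 'bytes')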
+ def ggml_relu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_relu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_repeat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + if a is the same shape as b, and a is not parameter, return a + otherwise, return a new tensor: repeat(a) to fit in b + + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_repeat_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_reshape(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + return view(a), b specifies the new shape + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_reshape_1d(ctx: ffi.CData, a: ffi.CData, ne0: int) -> ffi.CData: + """ + return view(a) + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + """ + ... + def ggml_reshape_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + """ + ... + def ggml_reshape_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int) -> ffi.CData: + """ + return view(a) + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + """ + ... + def ggml_reshape_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + """ + ... + def ggml_rms_norm(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + """ + ... + def ggml_rms_norm_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a - x + b - dy + TODO: update with configurable eps + + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_rms_norm_inplace(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + """ + ... 
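As the comments note, the reshape family returns views over the same data rather than copies. A small sketch, with ctx assumed to exist:

a = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 4, 3)
flat = lib.ggml_reshape_1d(ctx, a, 12)        # view: same 12 elements, new shape
assert lib.ggml_nelements(flat) == lib.ggml_nelements(a)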
+ def ggml_rope(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + rotary position embedding + if mode & 1 == 1, skip n_past elements + if mode & 2 == 1, GPT-NeoX style + if mode & 4 == 1, ChatGLM style + TODO: avoid creating a new tensor every time + + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... + def ggml_rope_back(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + rotary position embedding backward, i.e compute dx from dy + a - dy + + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... + def ggml_rope_custom(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData: + """ + custom RoPE + + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + """ + ... + def ggml_rope_custom_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + """ + ... + def ggml_rope_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... + def ggml_scale(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_scale_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_set(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return modified a + + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_set_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + """ + ... + def ggml_set_1d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + """ + ... 
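A sketch of the RoPE mode bits documented above, with ctx, x, n_past, n_dims and n_ctx assumed to exist; 10000.0 and 1.0 are the conventional default base and scale, passed through ggml_rope_custom purely for illustration:

mode = 0                                       # bit 1: skip n_past, bit 2: GPT-NeoX, bit 4: ChatGLM
y = lib.ggml_rope(ctx, x, n_past, n_dims, mode, n_ctx)
y2 = lib.ggml_rope_custom(ctx, x, n_past, n_dims, mode, n_ctx, 10000.0, 1.0)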
+ def ggml_set_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return modified a + + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + """ + ... + def ggml_set_2d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return view(a) + + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + """ + ... + def ggml_set_f32(tensor: ffi.CData, value: float) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);""" + ... + def ggml_set_f32_1d(tensor: ffi.CData, i: int, value: float) -> None: + """ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);""" + ... + def ggml_set_i32(tensor: ffi.CData, value: int) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);""" + ... + def ggml_set_i32_1d(tensor: ffi.CData, i: int, value: int) -> None: + """ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);""" + ... + def ggml_set_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return view(a) + + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_set_name(tensor: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);""" + ... + def ggml_set_no_alloc(ctx: ffi.CData, no_alloc: bool) -> None: + """ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);""" + ... + def ggml_set_param(ctx: ffi.CData, tensor: ffi.CData) -> None: + """ + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + """ + ... + def ggml_set_scratch(ctx: ffi.CData, scratch: ffi.CData) -> int: + """ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);""" + ... + def ggml_set_zero(tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);""" + ... + def ggml_sgn(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sgn_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_silu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_silu_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a - x + b - dy + + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_silu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... 
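A sketch of the scalar accessors above; ctx is assumed to be a normally allocated context (under no_alloc the tensor would have no data to write into):

t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 8)
lib.ggml_set_f32(t, 1.5)                       # fill every element
lib.ggml_set_f32_1d(t, 0, 42.0)                # overwrite element 0
lib.ggml_set_name(t, b"bias")                  # label used in graph dumps and gguf export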
+ def ggml_soft_max(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_soft_max_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_soft_max_back_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_soft_max_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqr(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqr_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqrt(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqrt_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_step(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_step_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sub(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_sub_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_sum(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + return scalar + + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sum_rows(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tanh(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tanh_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tensor_overhead() -> int: + """ + use this to compute the memory overhead of a tensor + + GGML_API size_t ggml_tensor_overhead(void); + """ + ... 
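ggml_tensor_overhead() reports the per-tensor metadata cost, which is the basis for sizing a context up front. A sketch budgeting eight f32 tensors of 1024 elements each:

n_tensors = 8
mem = n_tensors * (lib.ggml_tensor_overhead() + 1024 * 4)   # metadata plus data per tensor
params = ffi.new('struct ggml_init_params*')
params.mem_size = mem
ctx = lib.ggml_init(params[0])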
+ def ggml_time_init() -> None: + """ GGML_API void ggml_time_init(void); // call this once at the beginning of the program""" + ... + def ggml_time_ms() -> int: + """ GGML_API int64_t ggml_time_ms(void);""" + ... + def ggml_time_us() -> int: + """ GGML_API int64_t ggml_time_us(void);""" + ... + def ggml_transpose(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + alias for ggml_permute(ctx, a, 1, 0, 2, 3) + + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_type_name(type: int) -> ffi.CData: + """ GGML_API const char * ggml_type_name(enum ggml_type type);""" + ... + def ggml_type_size(type: int) -> int: + """ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block""" + ... + def ggml_type_sizef(type: int) -> float: + """ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float""" + ... + def ggml_unary(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + """ + ... + def ggml_unary_inplace(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + """ + ... + def ggml_used_mem(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);""" + ... + def ggml_vec_dot_q2_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """ + Dot product + + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + """ + ... + def ggml_vec_dot_q3_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q4_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q5_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q6_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_view_1d(ctx: ffi.CData, a: ffi.CData, ne0: int, offset: int) -> ffi.CData: + """ + offset in bytes + + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + """ + ... + def ggml_view_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, nb1: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + """ + ... 
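The view functions take strides and offsets in bytes, not elements, as the inline comments stress. A sketch over an f32 tensor a of shape [8, 4] in an existing ctx:

elt = lib.ggml_type_size(lib.GGML_TYPE_F32)      # 4 bytes per element
row1 = lib.ggml_view_1d(ctx, a, 8, 1 * 8 * elt)  # second row: 8 elements at a byte offset
at = lib.ggml_transpose(ctx, a)                  # alias for ggml_permute(ctx, a, 1, 0, 2, 3)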
+ def ggml_view_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, nb1: int, nb2: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + """ + ... + def ggml_view_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + """ + ... + def ggml_view_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);""" + ... + def ggml_win_part(ctx: ffi.CData, a: ffi.CData, w: int) -> ffi.CData: + """ + partition into non-overlapping windows with padding if needed + example: + a: 768 64 64 1 + w: 14 + res: 768 14 14 25 + used in sam + + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + """ + ... + def ggml_win_unpart(ctx: ffi.CData, a: ffi.CData, w0: int, h0: int, w: int) -> ffi.CData: + """ + reverse of ggml_win_part + used in sam + + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + """ + ... + def gguf_add_tensor(ctx: ffi.CData, tensor: ffi.CData) -> None: + """ + manage tensor info + + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + """ + ... + def gguf_find_key(ctx: ffi.CData, key: ffi.CData) -> int: + """ GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);""" + ... + def gguf_find_tensor(ctx: ffi.CData, name: ffi.CData) -> int: + """ GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);""" + ... + def gguf_free(ctx: ffi.CData) -> None: + """ GGML_API void gguf_free(struct gguf_context * ctx);""" + ... + def gguf_get_alignment(ctx: ffi.CData) -> int: + """ GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);""" + ... + def gguf_get_arr_data(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);""" + ... + def gguf_get_arr_n(ctx: ffi.CData, i: int) -> int: + """ GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);""" + ... + def gguf_get_arr_str(ctx: ffi.CData, key_id: int, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);""" + ... + def gguf_get_arr_type(ctx: ffi.CData, i: int) -> int: + """ GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);""" + ... + def gguf_get_data(ctx: ffi.CData) -> ffi.CData: + """ GGML_API void * gguf_get_data (struct gguf_context * ctx);""" + ... + def gguf_get_data_offset(ctx: ffi.CData) -> int: + """ GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);""" + ... + def gguf_get_key(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);""" + ... + def gguf_get_kv_type(ctx: ffi.CData, i: int) -> int: + """ GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);""" + ... 
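The ggml_win_part docstring's example (a: 768 64 64 1, w: 14, res: 768 14 14 25) works out because 64 is padded up to 70 = 5 * 14, giving 5 x 5 = 25 windows. As a sketch, with ctx assumed:

a = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 768, 64, 64)
parts = lib.ggml_win_part(ctx, a, 14)               # 768 x 14 x 14 x 25
back = lib.ggml_win_unpart(ctx, parts, 64, 64, 14)  # restore the original 768 x 64 x 64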
+ def gguf_get_meta_data(ctx: ffi.CData, data: ffi.CData) -> None: + """ GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);""" + ... + def gguf_get_meta_size(ctx: ffi.CData) -> int: + """ + get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + + GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx); + """ + ... + def gguf_get_n_kv(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_n_kv(struct gguf_context * ctx);""" + ... + def gguf_get_n_tensors(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);""" + ... + def gguf_get_tensor_name(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);""" + ... + def gguf_get_tensor_offset(ctx: ffi.CData, i: int) -> int: + """ GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_bool(ctx: ffi.CData, i: int) -> bool: + """ GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_f32(ctx: ffi.CData, i: int) -> float: + """ GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i16(ctx: ffi.CData, i: int) -> int: + """ GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i32(ctx: ffi.CData, i: int) -> int: + """ GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i8(ctx: ffi.CData, i: int) -> int: + """ GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_str(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u16(ctx: ffi.CData, i: int) -> int: + """ GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u32(ctx: ffi.CData, i: int) -> int: + """ GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u8(ctx: ffi.CData, i: int) -> int: + """ + results are undefined if the wrong type is used for the key + + GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); + """ + ... + def gguf_get_version(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_version (struct gguf_context * ctx);""" + ... + def gguf_init_empty() -> ffi.CData: + """ GGML_API struct gguf_context * gguf_init_empty(void);""" + ... + def gguf_init_from_file(fname: ffi.CData, params: ffi.CData) -> ffi.CData: + """ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);""" + ... + def gguf_set_arr_data(ctx: ffi.CData, key: ffi.CData, type: int, data: ffi.CData, n: int) -> None: + """ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);""" + ... + def gguf_set_arr_str(ctx: ffi.CData, key: ffi.CData, data: ffi.CData, n: int) -> None: + """ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);""" + ... + def gguf_set_kv(ctx: ffi.CData, src: ffi.CData) -> None: + """ + set or add KV pairs from another context + + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + """ + ... 
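A sketch of walking a GGUF file's key/value metadata with the readers above; model.gguf is a placeholder path:

p = ffi.new('struct gguf_init_params*')
p.no_alloc = True                              # metadata only here
p.ctx = ffi.NULL                               # no ggml context requested for tensor data
g = lib.gguf_init_from_file(b"model.gguf", p[0])
for i in range(lib.gguf_get_n_kv(g)):
    key = ffi.string(lib.gguf_get_key(g, i)).decode()
    kind = ffi.string(lib.gguf_type_name(lib.gguf_get_kv_type(g, i))).decode()
    print(i, key, kind)
lib.gguf_free(g)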
+ def gguf_set_tensor_data(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int) -> None: + """ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);""" + ... + def gguf_set_tensor_type(ctx: ffi.CData, name: ffi.CData, type: int) -> None: + """ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);""" + ... + def gguf_set_val_bool(ctx: ffi.CData, key: ffi.CData, val: bool) -> None: + """ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);""" + ... + def gguf_set_val_f32(ctx: ffi.CData, key: ffi.CData, val: float) -> None: + """ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);""" + ... + def gguf_set_val_i16(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);""" + ... + def gguf_set_val_i32(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);""" + ... + def gguf_set_val_i8(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);""" + ... + def gguf_set_val_str(ctx: ffi.CData, key: ffi.CData, val: ffi.CData) -> None: + """ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);""" + ... + def gguf_set_val_u16(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);""" + ... + def gguf_set_val_u32(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);""" + ... + def gguf_set_val_u8(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ + overrides existing values or adds a new one + + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + """ + ... + def gguf_type_name(type: int) -> ffi.CData: + """ GGML_API const char * gguf_type_name(enum gguf_type type);""" + ... + def gguf_write_to_file(ctx: ffi.CData, fname: ffi.CData, only_meta: bool) -> None: + """ + write the entire context to a binary file + + GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta); + """ + ... + def quantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q2_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """ + Quantization + + void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); + """ + ... + def quantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q3_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);""" + ... + def quantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q4_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);""" + ... 
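And the reverse direction, authoring a file with the writer API above; the key names and the tensor t are illustrative placeholders:

g = lib.gguf_init_empty()
lib.gguf_set_val_str(g, b"general.name", b"demo")
lib.gguf_set_val_u32(g, b"demo.version", 1)
lib.gguf_add_tensor(g, t)                      # t: a ggml tensor carrying data
lib.gguf_write_to_file(g, b"out.gguf", False)  # only_meta=False writes tensor data too
lib.gguf_free(g)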
+ def quantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q5_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);""" + ... + def quantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q6_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);""" + ... + def quantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);""" + ... \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/examples/python/ggml/cffi.py b/stable-diffusion.cpp/ggml/examples/python/ggml/cffi.py new file mode 100644 index 0000000000000000000000000000000000000000..7b65ff6ff501656fcb2f00f5b5db4d0943f69fd4 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/python/ggml/cffi.py @@ -0,0 +1,11 @@ +# auto-generated file +import _cffi_backend + +ffi = _cffi_backend.FFI('ggml.cffi', + _version = 0x2601, + _types = b'\x00\x00\xB6\x0D\x00\x00\x09\x0B\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x2F\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x31\x03\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x34\x03\x00\x03\xFE\x03\x00\x04\x53\x03\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3E\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x00\x0F\x00\x02\xD0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x04\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x0B\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x04\x38\x03\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x08\x11\x00\x04\x30\x03\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x20\x09\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x01\x0D\x00\x00\x01\x0B\x00\x00\x00\x0F\x00\x01\x14\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x34\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x02\x7E\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x18\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x02\xE9\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x4B\x11\x00\x04\x33\x03\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x04\x35\x03\
x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x00\x0F\x00\x03\xB0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xB5\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0F\x11\x00\x00\x0B\x03\x00\x00\xB0\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x0B\x0D\x00\x00\x1B\x09\x00\x00\x00\x0F\x00\x04\x33\x0D\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0E\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x7F\x0D\x00\x00\x00\x0F\x00\x00\x50\x0D\x00\x00\x07\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x4B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x05\x0B\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x01\x01\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0A\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00
\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x5C\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x62\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x02\xD8\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x4F\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x54\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x02\xD3\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x44\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x48\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x01\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x0F\x11\x00\x00\x24\x09\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x00\x0F\x00\x03\xBA\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xBF\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\xF4\x03\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x0
2\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x02\x39\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x04\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x0B\x11\x00\x00\x21\x09\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x02\x4B\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x02\xE1\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF8\x03\x00\x00\xF4\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF9\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFA\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFB\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFC\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFD\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF8\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF9\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFA\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFB\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFC\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFD\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x6C\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x03\xFE\x03\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x04\x53\x03\x00\x02\xE1\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x22\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x30\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x50\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x
4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x37\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x34\x11\x00\x02\xE1\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x05\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x03\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x04\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x08\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x02\xE1\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x6C\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xE1\x11\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x00\x0F\x00\x00\x24\x03\x00\x00\x0D\x09\x00\x00\x0E\x09\x00\x00\x0F\x09\x00\x00\x10\x09\x00\x00\x11\x09\x00\x00\x12\x09\x00\x00\x13\x09\x00\x00\x14\x09\x00\x00\x04\x09\x00\x00\x05\
x09\x00\x00\x06\x09\x00\x00\x07\x09\x00\x00\x08\x09\x00\x00\x09\x09\x00\x00\x0A\x09\x00\x00\x02\x01\x00\x03\xFE\x05\x00\x00\x00\x80\x00\x03\xFE\x05\x00\x00\x00\x10\x00\x03\xFE\x05\x00\x00\x00\xC0\x00\x03\xFE\x05\x00\x00\x00\x25\x00\x03\xFE\x05\x00\x00\x00\x28\x00\x03\xFE\x05\x00\x00\x00\x04\x00\x03\xFE\x05\x00\x00\x00\x38\x00\x03\xFE\x05\x00\x00\x00\x40\x00\x03\xFE\x05\x00\x00\x1F\xF0\x00\x03\xFE\x05\x00\x00\x00\x08\x00\x00\x00\x0B\x00\x00\x02\x0B\x00\x00\x03\x0B\x00\x00\x06\x0B\x00\x00\x08\x0B\x00\x00\x0B\x09\x00\x00\x22\x05\x00\x00\x10\x00\x00\x00\x22\x05\x00\x00\x00\x08\x00\x00\x0F\x01\x00\x00\xDB\x05\x00\x00\x00\x04\x00\x00\x09\x01\x00\x03\xB0\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x01\x00\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x04\x2C\x03\x00\x00\x0C\x09\x00\x04\x2E\x03\x00\x00\x15\x09\x00\x00\x16\x09\x00\x00\x17\x09\x00\x00\x18\x09\x00\x00\x19\x09\x00\x00\x1A\x09\x00\x00\x1C\x09\x00\x00\x1D\x09\x00\x04\x37\x03\x00\x00\x1E\x09\x00\x00\x1F\x09\x00\x00\x08\x05\x00\x00\x10\x00\x00\x00\x08\x05\x00\x00\x00\x06\x00\x00\x22\x09\x00\x00\x23\x09\x00\x03\xBA\x03\x00\x03\xBA\x05\x00\x00\x00\x80\x00\x03\xBA\x05\x00\x00\x00\x0C\x00\x03\xBA\x05\x00\x00\x00\x10\x00\x03\xBA\x05\x00\x00\x00\x20\x00\x03\xBA\x05\x00\x00\x00\x40\x00\x00\x0C\x01\x00\x00\x11\x05\x00\x00\x00\x04\x00\x00\x10\x05\x00\x00\x20\x51\x00\x02\xC6\x03\x00\x02\xDE\x03\x00\x03\xE0\x03\x00\x03\xE7\x03\x00\x00\x00\x01', + _globals = (b'\xFF\xFF\xFF\x0BGGML_BACKEND_CPU',0,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU',10,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU_SPLIT',20,b'\xFF\xFF\xFF\x0BGGML_FTYPE_ALL_F32',0,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_F16',1,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1_SOME_F16',4,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_0',8,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_1',9,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q8_0',7,b'\xFF\xFF\xFF\x0BGGML_FTYPE_UNKNOWN',-1,b'\xFF\xFF\xFF\x1FGGML_GRAPH_SIZE',164520,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_ARMIJO',0,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',2,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_WOLFE',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_DEFAULT',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_FAIL',-128,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_INVALID_PARAMETERS',-124,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_ITERATIONS',-125,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_STEP',-126,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MINIMUM_STEP',-127,b'\xFF\xFF\xFF\x0BGGML_OBJECT_GRAPH',1,b'\xFF\xFF\xFF\x1FGGML_OBJECT_SIZE',32,b'\xFF\xFF\xFF\x0BGGML_OBJECT_TENSOR',0,b'\xFF\xFF\xFF\x0BGGML_OBJECT_WORK_BUFFER',2,b'\xFF\xFF\xFF\x0BGGML_OPT_ADAM',0,b'\xFF\xFF\xFF\x0BGGML_OPT_DID_NOT_CONVERGE',1,b'\xFF\xFF\xFF\x0BGGML_OPT_FAIL',4,b'\xFF\xFF\xFF\x0BGGML_OPT_INVALID_WOLFE',3,b'\xFF\xFF\xFF\x0BGGML_OPT_LBFGS',1,b'\xFF\xFF\xFF\x0BGGML_OPT_NO_CONTEXT',2,b'\xFF\xFF\xFF\x0BGGML_OPT_OK',0,b'\xFF\xFF\xFF\x0BGGML_OP_ACC',4,b'\xFF\xFF\xFF\x0BGGML_OP_ADD',2,b'\xFF\xFF\xFF\x0BGGML_OP_ADD1',3,b'\xFF\xFF\xFF\x0BGGML_OP_ALIBI',40,b'\xFF\xFF\xFF\x0BGGML_OP_ARGMAX',14,b'\xFF\xFF\xFF\x0BGGML_OP_CLAMP',41,b'\xFF\xFF\xFF\x0BGGML_OP_CONT',26,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_1D',42,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_2D',43,b'\xFF\xFF\xFF\x0BGGML_OP_COUNT',62,b'\xFF\xFF\xF
F\x0BGGML_OP_CPY',25,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS',60,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS_BACK',61,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG',33,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_INF',34,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_ZERO',35,b'\xFF\xFF\xFF\x0BGGML_OP_DIV',7,b'\xFF\xFF\xFF\x0BGGML_OP_DUP',1,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN',46,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN_BACK',48,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_FF',47,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS',31,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS_BACK',32,b'\xFF\xFF\xFF\x0BGGML_OP_LOG',10,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_BINARY',53,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1',57,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1_F32',54,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2',58,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2_F32',55,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3',59,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3_F32',56,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_UNARY',52,b'\xFF\xFF\xFF\x0BGGML_OP_MEAN',13,b'\xFF\xFF\xFF\x0BGGML_OP_MUL',6,b'\xFF\xFF\xFF\x0BGGML_OP_MUL_MAT',21,b'\xFF\xFF\xFF\x0BGGML_OP_NONE',0,b'\xFF\xFF\xFF\x0BGGML_OP_NORM',18,b'\xFF\xFF\xFF\x0BGGML_OP_OUT_PROD',22,b'\xFF\xFF\xFF\x0BGGML_OP_PERMUTE',29,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_1D',44,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_2D',45,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_AVG',1,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_COUNT',2,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_MAX',0,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT',15,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT_BACK',16,b'\xFF\xFF\xFF\x0BGGML_OP_RESHAPE',27,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM',19,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM_BACK',20,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE',38,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE_BACK',39,b'\xFF\xFF\xFF\x0BGGML_OP_SCALE',23,b'\xFF\xFF\xFF\x0BGGML_OP_SET',24,b'\xFF\xFF\xFF\x0BGGML_OP_SILU_BACK',17,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX',36,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX_BACK',37,b'\xFF\xFF\xFF\x0BGGML_OP_SQR',8,b'\xFF\xFF\xFF\x0BGGML_OP_SQRT',9,b'\xFF\xFF\xFF\x0BGGML_OP_SUB',5,b'\xFF\xFF\xFF\x0BGGML_OP_SUM',11,b'\xFF\xFF\xFF\x0BGGML_OP_SUM_ROWS',12,b'\xFF\xFF\xFF\x0BGGML_OP_TRANSPOSE',30,b'\xFF\xFF\xFF\x0BGGML_OP_UNARY',51,b'\xFF\xFF\xFF\x0BGGML_OP_VIEW',28,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_PART',49,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_UNPART',50,b'\xFF\xFF\xFF\x0BGGML_TASK_COMPUTE',1,b'\xFF\xFF\xFF\x0BGGML_TASK_FINALIZE',2,b'\xFF\xFF\xFF\x0BGGML_TASK_INIT',0,b'\xFF\xFF\xFF\x1FGGML_TENSOR_SIZE',288,b'\xFF\xFF\xFF\x0BGGML_TYPE_COUNT',19,b'\xFF\xFF\xFF\x0BGGML_TYPE_F16',1,b'\xFF\xFF\xFF\x0BGGML_TYPE_F32',0,b'\xFF\xFF\xFF\x0BGGML_TYPE_I16',17,b'\xFF\xFF\xFF\x0BGGML_TYPE_I32',18,b'\xFF\xFF\xFF\x0BGGML_TYPE_I8',16,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_0',6,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_1',7,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_0',8,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_1',9,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_K',15,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ABS',0,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ELU',5,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU',7,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU_QUICK',8,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_NEG',2,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_RELU',6,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SGN',1,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SILU',9,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_STEP',3,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_TANH',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_ARRAY',9,b'\xFF\xFF\xFF\x0BGGUF_TYPE_BOOL',7,b'\xFF\xFF\xFF\x0BGGUF_TYPE_COUNT',10,b'\xFF\xFF\xFF\x0BGGUF_TYPE_FLOAT32',6,b'\xFF\xFF\xFF\x0
BGGUF_TYPE_INT16',3,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT32',5,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT8',1,b'\xFF\xFF\xFF\x0BGGUF_TYPE_STRING',8,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT16',2,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT32',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT8',0,b'\x00\x02\x9A\x23__assert_rtn',0,b'\x00\x02\x7C\x23dequantize_row_q2_K',0,b'\x00\x02\x81\x23dequantize_row_q3_K',0,b'\x00\x02\x86\x23dequantize_row_q4_K',0,b'\x00\x02\x8B\x23dequantize_row_q5_K',0,b'\x00\x02\x90\x23dequantize_row_q6_K',0,b'\x00\x02\x95\x23dequantize_row_q8_K',0,b'\x00\x00\xFA\x23ggml_abs',0,b'\x00\x00\xFA\x23ggml_abs_inplace',0,b'\x00\x01\xDD\x23ggml_acc',0,b'\x00\x01\xDD\x23ggml_acc_inplace',0,b'\x00\x01\x84\x23ggml_add',0,b'\x00\x01\x84\x23ggml_add1',0,b'\x00\x01\x84\x23ggml_add1_inplace',0,b'\x00\x01\x84\x23ggml_add_inplace',0,b'\x00\x01\x26\x23ggml_alibi',0,b'\x00\x02\xEC\x23ggml_allocr_alloc',0,b'\x00\x02\x42\x23ggml_allocr_alloc_graph',0,b'\x00\x02\xE4\x23ggml_allocr_free',0,b'\x00\x00\x03\x23ggml_allocr_is_measure',0,b'\x00\x00\xA2\x23ggml_allocr_new',0,b'\x00\x00\x9F\x23ggml_allocr_new_measure',0,b'\x00\x02\xE4\x23ggml_allocr_reset',0,b'\x00\x02\xE7\x23ggml_allocr_set_parse_seq',0,b'\x00\x00\x17\x23ggml_are_same_shape',0,b'\x00\x00\xFA\x23ggml_argmax',0,b'\x00\x00\x74\x23ggml_blck_size',0,b'\x00\x00\xB3\x23ggml_build_backward',0,b'\x00\x00\xB8\x23ggml_build_forward',0,b'\x00\x00\xAA\x23ggml_build_forward_ctx',0,b'\x00\x02\xF3\x23ggml_build_forward_expand',0,b'\x00\x00\x1B\x23ggml_cl_can_mul_mat',0,b'\x00\x03\x6B\x23ggml_cl_free_data',0,b'\x00\x03\xE0\x23ggml_cl_host_free',0,b'\x00\x02\x72\x23ggml_cl_host_malloc',0,b'\x00\x03\xEC\x23ggml_cl_init',0,b'\x00\x03\x78\x23ggml_cl_mul',0,b'\x00\x03\x7D\x23ggml_cl_mul_mat',0,b'\x00\x02\x54\x23ggml_cl_mul_mat_get_wsize',0,b'\x00\x03\xE3\x23ggml_cl_transform_tensor',0,b'\x00\x01\x1B\x23ggml_clamp',0,b'\x00\x00\xFA\x23ggml_cont',0,b'\x00\x00\xFA\x23ggml_cont_inplace',0,b'\x00\x01\x90\x23ggml_conv_1d',0,b'\x00\x01\x89\x23ggml_conv_1d_ph',0,b'\x00\x01\x98\x23ggml_conv_2d',0,b'\x00\x00\x90\x23ggml_cpu_has_arm_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_avx',0,b'\x00\x00\x90\x23ggml_cpu_has_avx2',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vbmi',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vnni',0,b'\x00\x00\x90\x23ggml_cpu_has_blas',0,b'\x00\x00\x90\x23ggml_cpu_has_clblast',0,b'\x00\x00\x90\x23ggml_cpu_has_cublas',0,b'\x00\x00\x90\x23ggml_cpu_has_f16c',0,b'\x00\x00\x90\x23ggml_cpu_has_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_fp16_va',0,b'\x00\x00\x90\x23ggml_cpu_has_gpublas',0,b'\x00\x00\x90\x23ggml_cpu_has_neon',0,b'\x00\x00\x90\x23ggml_cpu_has_sse3',0,b'\x00\x00\x90\x23ggml_cpu_has_vsx',0,b'\x00\x00\x90\x23ggml_cpu_has_wasm_simd',0,b'\x00\x01\x84\x23ggml_cpy',0,b'\x00\x01\x84\x23ggml_cpy_inplace',0,b'\x00\x01\x84\x23ggml_cross_entropy_loss',0,b'\x00\x01\xA3\x23ggml_cross_entropy_loss_back',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_force_inplace',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_no_scratch',0,b'\x00\x00\x1B\x23ggml_cuda_can_mul_mat',0,b'\x00\x00\x06\x23ggml_cuda_compute_forward',0,b'\x00\x03\x41\x23ggml_cuda_free_data',0,b'\x00\x03\xEC\x23ggml_cuda_free_scratch',0,b'\x00\x00\x90\x23ggml_cuda_get_device_count',0,b'\x00\x02\xCE\x23ggml_cuda_get_device_description',0,b'\x00\x03\xE0\x23ggml_cuda_host_free',0,b'\x00\x02\x72\x23ggml_cuda_host_malloc',0,b'\x00\x02\xCB\x23ggml_cuda_set_main_device',0,b'\x00\x02\x79\x23ggml_cuda_set_mul_mat_q',0,b'\x00\x03\xD8\x23ggml_cuda_set_scratch_size',0,b'\x00\x02\xA0\x23ggml_
cuda_set_tensor_split',0,b'\x00\x03\xE3\x23ggml_cuda_transform_tensor',0,b'\x00\x00\x95\x23ggml_cycles',0,b'\x00\x00\x95\x23ggml_cycles_per_ms',0,b'\x00\x00\xFA\x23ggml_diag',0,b'\x00\x01\x21\x23ggml_diag_mask_inf',0,b'\x00\x01\x21\x23ggml_diag_mask_inf_inplace',0,b'\x00\x01\x21\x23ggml_diag_mask_zero',0,b'\x00\x01\x21\x23ggml_diag_mask_zero_inplace',0,b'\x00\x01\x84\x23ggml_div',0,b'\x00\x01\x84\x23ggml_div_inplace',0,b'\x00\x00\xFA\x23ggml_dup',0,b'\x00\x00\xFA\x23ggml_dup_inplace',0,b'\x00\x02\x0B\x23ggml_dup_tensor',0,b'\x00\x02\x4D\x23ggml_element_size',0,b'\x00\x00\xFA\x23ggml_elu',0,b'\x00\x00\xFA\x23ggml_elu_inplace',0,b'\x00\x01\xA9\x23ggml_flash_attn',0,b'\x00\x01\xB0\x23ggml_flash_attn_back',0,b'\x00\x01\xB8\x23ggml_flash_ff',0,b'\x00\x02\x16\x23ggml_format_name',0,b'\x00\x00\x6B\x23ggml_fp16_to_fp32',0,b'\x00\x03\xDB\x23ggml_fp16_to_fp32_row',0,b'\x00\x02\x62\x23ggml_fp32_to_fp16',0,b'\x00\x02\xC1\x23ggml_fp32_to_fp16_row',0,b'\x00\x03\x03\x23ggml_free',0,b'\x00\x00\x53\x23ggml_ftype_to_ggml_type',0,b'\x00\x00\xFA\x23ggml_gelu',0,b'\x00\x00\xFA\x23ggml_gelu_inplace',0,b'\x00\x00\xFA\x23ggml_gelu_quick',0,b'\x00\x00\xFA\x23ggml_gelu_quick_inplace',0,b'\x00\x02\x6C\x23ggml_get_data',0,b'\x00\x00\x5D\x23ggml_get_data_f32',0,b'\x00\x00\x63\x23ggml_get_f32_1d',0,b'\x00\x00\x81\x23ggml_get_i32_1d',0,b'\x00\x02\x4A\x23ggml_get_max_tensor_size',0,b'\x00\x02\x69\x23ggml_get_mem_buffer',0,b'\x00\x02\x4A\x23ggml_get_mem_size',0,b'\x00\x00\x36\x23ggml_get_name',0,b'\x00\x00\x0A\x23ggml_get_no_alloc',0,b'\x00\x01\x84\x23ggml_get_rows',0,b'\x00\x01\xA3\x23ggml_get_rows_back',0,b'\x00\x00\xCE\x23ggml_get_tensor',0,b'\x00\x00\x56\x23ggml_get_unary_op',0,b'\x00\x00\x77\x23ggml_graph_compute',0,b'\x00\x03\x0A\x23ggml_graph_compute_with_ctx',0,b'\x00\x02\xFE\x23ggml_graph_dump_dot',0,b'\x00\x02\xFA\x23ggml_graph_export',0,b'\x00\x00\xCA\x23ggml_graph_get_tensor',0,b'\x00\x00\xAE\x23ggml_graph_import',0,b'\x00\x02\x60\x23ggml_graph_overhead',0,b'\x00\x00\xBE\x23ggml_graph_plan',0,b'\x00\x02\xF7\x23ggml_graph_print',0,b'\x00\x02\xF0\x23ggml_graph_reset',0,b'\x00\x00\xBB\x23ggml_init',0,b'\x00\x03\xEC\x23ggml_init_cublas',0,b'\x00\x00\x6E\x23ggml_internal_get_type_traits',0,b'\x00\x00\x14\x23ggml_is_contiguous',0,b'\x00\x00\x27\x23ggml_is_numa',0,b'\x00\x00\x14\x23ggml_is_permuted',0,b'\x00\x00\x00\x23ggml_is_quantized',0,b'\x00\x00\x14\x23ggml_is_transposed',0,b'\x00\x00\xFA\x23ggml_log',0,b'\x00\x00\xFA\x23ggml_log_inplace',0,b'\x00\x01\xE6\x23ggml_map_binary_f32',0,b'\x00\x01\xE6\x23ggml_map_binary_inplace_f32',0,b'\x00\x02\x04\x23ggml_map_custom1',0,b'\x00\x01\xFF\x23ggml_map_custom1_f32',0,b'\x00\x02\x04\x23ggml_map_custom1_inplace',0,b'\x00\x01\xFF\x23ggml_map_custom1_inplace_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2',0,b'\x00\x01\xEC\x23ggml_map_custom2_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2_inplace',0,b'\x00\x01\xEC\x23ggml_map_custom2_inplace_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3',0,b'\x00\x01\xC0\x23ggml_map_custom3_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3_inplace',0,b'\x00\x01\xC0\x23ggml_map_custom3_inplace_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_inplace_f32',0,b'\x00\x00\xFA\x23ggml_mean',0,b'\x00\x00\x0D\x23ggml_metal_add_buffer',0,b'\x00\x03\x1C\x23ggml_metal_free',0,b'\x00\x00\x71\x23ggml_metal_get_concur_list',0,b'\x00\x03\x2C\x23ggml_metal_get_tensor',0,b'\x00\x03\x23\x23ggml_metal_graph_compute',0,b'\x00\x03\x27\x23ggml_metal_graph_find_concurrency',0,b'\x00\x03\xE0\x23ggml_metal_host_free',0,b'\x00\x02\x72\x23ggml_metal_host_malloc',
0,b'\x00\x00\x7B\x23ggml_metal_if_optimized',0,b'\x00\x00\xC2\x23ggml_metal_init',0,b'\x00\x03\x1F\x23ggml_metal_set_n_cb',0,b'\x00\x03\x2C\x23ggml_metal_set_tensor',0,b'\x00\x03\xEC\x23ggml_mpi_backend_free',0,b'\x00\x03\xEC\x23ggml_mpi_backend_init',0,b'\x00\x03\x33\x23ggml_mpi_eval_init',0,b'\x00\x03\x30\x23ggml_mpi_free',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_post',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_pre',0,b'\x00\x00\xC5\x23ggml_mpi_init',0,b'\x00\x00\x7E\x23ggml_mpi_rank',0,b'\x00\x01\x84\x23ggml_mul',0,b'\x00\x01\x84\x23ggml_mul_inplace',0,b'\x00\x01\x84\x23ggml_mul_mat',0,b'\x00\x02\x4D\x23ggml_nbytes',0,b'\x00\x02\x4D\x23ggml_nbytes_pad',0,b'\x00\x02\x50\x23ggml_nbytes_split',0,b'\x00\x00\xFA\x23ggml_neg',0,b'\x00\x00\xFA\x23ggml_neg_inplace',0,b'\x00\x00\x92\x23ggml_nelements',0,b'\x00\x00\xF2\x23ggml_new_f32',0,b'\x00\x00\xA7\x23ggml_new_graph',0,b'\x00\x00\xF6\x23ggml_new_i32',0,b'\x00\x00\xD2\x23ggml_new_tensor',0,b'\x00\x00\xD8\x23ggml_new_tensor_1d',0,b'\x00\x00\xDD\x23ggml_new_tensor_2d',0,b'\x00\x00\xE3\x23ggml_new_tensor_3d',0,b'\x00\x00\xEA\x23ggml_new_tensor_4d',0,b'\x00\x00\xFA\x23ggml_norm',0,b'\x00\x00\xFA\x23ggml_norm_inplace',0,b'\x00\x00\x92\x23ggml_nrows',0,b'\x00\x03\xEC\x23ggml_numa_init',0,b'\x00\x00\x2D\x23ggml_op_name',0,b'\x00\x00\x2D\x23ggml_op_symbol',0,b'\x00\x00\x4E\x23ggml_opt',0,b'\x00\x00\xC7\x23ggml_opt_default_params',0,b'\x00\x03\x0F\x23ggml_opt_init',0,b'\x00\x00\x42\x23ggml_opt_resume',0,b'\x00\x00\x47\x23ggml_opt_resume_g',0,b'\x00\x01\x84\x23ggml_out_prod',0,b'\x00\x01\x34\x23ggml_permute',0,b'\x00\x00\xFE\x23ggml_pool_1d',0,b'\x00\x01\x06\x23ggml_pool_2d',0,b'\x00\x03\x3E\x23ggml_print_object',0,b'\x00\x03\x19\x23ggml_print_objects',0,b'\x00\x02\x33\x23ggml_quantize_chunk',0,b'\x00\x02\x3B\x23ggml_quantize_q2_K',0,b'\x00\x02\x3B\x23ggml_quantize_q3_K',0,b'\x00\x02\x3B\x23ggml_quantize_q4_0',0,b'\x00\x02\x3B\x23ggml_quantize_q4_1',0,b'\x00\x02\x3B\x23ggml_quantize_q4_K',0,b'\x00\x02\x3B\x23ggml_quantize_q5_0',0,b'\x00\x02\x3B\x23ggml_quantize_q5_1',0,b'\x00\x02\x3B\x23ggml_quantize_q5_K',0,b'\x00\x02\x3B\x23ggml_quantize_q6_K',0,b'\x00\x02\x3B\x23ggml_quantize_q8_0',0,b'\x00\x00\xFA\x23ggml_relu',0,b'\x00\x00\xFA\x23ggml_relu_inplace',0,b'\x00\x01\x84\x23ggml_repeat',0,b'\x00\x01\x84\x23ggml_repeat_back',0,b'\x00\x01\x84\x23ggml_reshape',0,b'\x00\x01\x46\x23ggml_reshape_1d',0,b'\x00\x01\x4B\x23ggml_reshape_2d',0,b'\x00\x01\x51\x23ggml_reshape_3d',0,b'\x00\x01\x58\x23ggml_reshape_4d',0,b'\x00\x01\x16\x23ggml_rms_norm',0,b'\x00\x01\x84\x23ggml_rms_norm_back',0,b'\x00\x01\x16\x23ggml_rms_norm_inplace',0,b'\x00\x01\x34\x23ggml_rope',0,b'\x00\x01\x34\x23ggml_rope_back',0,b'\x00\x01\x3C\x23ggml_rope_custom',0,b'\x00\x01\x3C\x23ggml_rope_custom_inplace',0,b'\x00\x01\x34\x23ggml_rope_inplace',0,b'\x00\x01\x84\x23ggml_scale',0,b'\x00\x01\x84\x23ggml_scale_inplace',0,b'\x00\x01\xDD\x23ggml_set',0,b'\x00\x01\xD0\x23ggml_set_1d',0,b'\x00\x01\xD0\x23ggml_set_1d_inplace',0,b'\x00\x01\xD6\x23ggml_set_2d',0,b'\x00\x01\xD6\x23ggml_set_2d_inplace',0,b'\x00\x02\x1A\x23ggml_set_f32',0,b'\x00\x03\x6E\x23ggml_set_f32_1d',0,b'\x00\x02\x1E\x23ggml_set_i32',0,b'\x00\x03\x73\x23ggml_set_i32_1d',0,b'\x00\x01\xDD\x23ggml_set_inplace',0,b'\x00\x02\x12\x23ggml_set_name',0,b'\x00\x03\x06\x23ggml_set_no_alloc',0,b'\x00\x03\x15\x23ggml_set_param',0,b'\x00\x02\x46\x23ggml_set_scratch',0,b'\x00\x02\x0F\x23ggml_set_zero',0,b'\x00\x00\xFA\x23ggml_sgn',0,b'\x00\x00\xFA\x23ggml_sgn_inplace',0,b'\x00\x00\xFA\x23ggml_silu',0,b'\x00\x01\x84\x23ggml_silu_back',0,b'\x00\x00\xF
A\x23ggml_silu_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max',0,b'\x00\x01\x84\x23ggml_soft_max_back',0,b'\x00\x01\x84\x23ggml_soft_max_back_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max_inplace',0,b'\x00\x00\xFA\x23ggml_sqr',0,b'\x00\x00\xFA\x23ggml_sqr_inplace',0,b'\x00\x00\xFA\x23ggml_sqrt',0,b'\x00\x00\xFA\x23ggml_sqrt_inplace',0,b'\x00\x00\xFA\x23ggml_step',0,b'\x00\x00\xFA\x23ggml_step_inplace',0,b'\x00\x01\x84\x23ggml_sub',0,b'\x00\x01\x84\x23ggml_sub_inplace',0,b'\x00\x00\xFA\x23ggml_sum',0,b'\x00\x00\xFA\x23ggml_sum_rows',0,b'\x00\x00\xFA\x23ggml_tanh',0,b'\x00\x00\xFA\x23ggml_tanh_inplace',0,b'\x00\x02\x60\x23ggml_tensor_overhead',0,b'\x00\x03\xEC\x23ggml_time_init',0,b'\x00\x00\x95\x23ggml_time_ms',0,b'\x00\x00\x95\x23ggml_time_us',0,b'\x00\x00\xFA\x23ggml_transpose',0,b'\x00\x00\x30\x23ggml_type_name',0,b'\x00\x02\x30\x23ggml_type_size',0,b'\x00\x00\x60\x23ggml_type_sizef',0,b'\x00\x01\x11\x23ggml_unary',0,b'\x00\x01\x11\x23ggml_unary_inplace',0,b'\x00\x02\x4A\x23ggml_used_mem',0,b'\x00\x02\xDE\x23ggml_vec_dot_q2_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q3_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q4_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q5_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q6_K_q8_K',0,b'\x00\x01\x7E\x23ggml_view_1d',0,b'\x00\x01\x76\x23ggml_view_2d',0,b'\x00\x01\x6C\x23ggml_view_3d',0,b'\x00\x01\x60\x23ggml_view_4d',0,b'\x00\x02\x0B\x23ggml_view_tensor',0,b'\x00\x01\x21\x23ggml_win_part',0,b'\x00\x01\x2D\x23ggml_win_unpart',0,b'\x00\x03\xCC\x23gguf_add_tensor',0,b'\x00\x00\x88\x23gguf_find_key',0,b'\x00\x00\x88\x23gguf_find_tensor',0,b'\x00\x03\x84\x23gguf_free',0,b'\x00\x02\x59\x23gguf_get_alignment',0,b'\x00\x02\x75\x23gguf_get_arr_data',0,b'\x00\x00\x8C\x23gguf_get_arr_n',0,b'\x00\x00\x3D\x23gguf_get_arr_str',0,b'\x00\x00\x59\x23gguf_get_arr_type',0,b'\x00\x02\x6F\x23gguf_get_data',0,b'\x00\x02\x59\x23gguf_get_data_offset',0,b'\x00\x00\x39\x23gguf_get_key',0,b'\x00\x00\x59\x23gguf_get_kv_type',0,b'\x00\x03\xD4\x23gguf_get_meta_data',0,b'\x00\x02\x59\x23gguf_get_meta_size',0,b'\x00\x00\x85\x23gguf_get_n_kv',0,b'\x00\x00\x85\x23gguf_get_n_tensors',0,b'\x00\x00\x29\x23gguf_get_tensor_name',0,b'\x00\x02\x5C\x23gguf_get_tensor_offset',0,b'\x00\x00\x20\x23gguf_get_val_bool',0,b'\x00\x00\x67\x23gguf_get_val_f32',0,b'\x00\x00\x97\x23gguf_get_val_i16',0,b'\x00\x00\x8C\x23gguf_get_val_i32',0,b'\x00\x00\x9B\x23gguf_get_val_i8',0,b'\x00\x00\x39\x23gguf_get_val_str',0,b'\x00\x02\x65\x23gguf_get_val_u16',0,b'\x00\x02\x2C\x23gguf_get_val_u32',0,b'\x00\x02\x28\x23gguf_get_val_u8',0,b'\x00\x00\x85\x23gguf_get_version',0,b'\x00\x02\x26\x23gguf_init_empty',0,b'\x00\x02\x22\x23gguf_init_from_file',0,b'\x00\x03\x9C\x23gguf_set_arr_data',0,b'\x00\x03\x8C\x23gguf_set_arr_str',0,b'\x00\x03\xD0\x23gguf_set_kv',0,b'\x00\x03\xC6\x23gguf_set_tensor_data',0,b'\x00\x03\x97\x23gguf_set_tensor_type',0,b'\x00\x03\x87\x23gguf_set_val_bool',0,b'\x00\x03\xA3\x23gguf_set_val_f32',0,b'\x00\x03\xAD\x23gguf_set_val_i16',0,b'\x00\x03\xA8\x23gguf_set_val_i32',0,b'\x00\x03\xB2\x23gguf_set_val_i8',0,b'\x00\x03\x92\x23gguf_set_val_str',0,b'\x00\x03\xC1\x23gguf_set_val_u16',0,b'\x00\x03\xBC\x23gguf_set_val_u32',0,b'\x00\x03\xB7\x23gguf_set_val_u8',0,b'\x00\x00\x33\x23gguf_type_name',0,b'\x00\x03\x87\x23gguf_write_to_file',0,b'\x00\x02\xC6\x23quantize_row_q2_K',0,b'\x00\x02\xA3\x23quantize_row_q2_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q3_K',0,b'\x00\x02\xA8\x23quantize_row_q3_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q4_K',0,b'\x00\x02\xAD\x23quantize_row_q4_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q5
_K',0,b'\x00\x02\xB2\x23quantize_row_q5_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q6_K',0,b'\x00\x02\xB7\x23quantize_row_q6_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q8_K',0,b'\x00\x02\xBC\x23quantize_row_q8_K_reference',0), + _struct_unions = ((b'\x00\x00\x04\x27\x00\x00\x00\x02$1',b'\x00\x00\x22\x11n_iter',b'\x00\x00\xF4\x11sched',b'\x00\x00\xF4\x11decay',b'\x00\x00\xF4\x11alpha',b'\x00\x00\xF4\x11beta1',b'\x00\x00\xF4\x11beta2',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11eps_f',b'\x00\x00\xF4\x11eps_g'),(b'\x00\x00\x04\x28\x00\x00\x00\x02$2',b'\x00\x00\x22\x11m',b'\x00\x00\x22\x11n_iter',b'\x00\x00\x22\x11max_linesearch',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11ftol',b'\x00\x00\xF4\x11wolfe',b'\x00\x00\xF4\x11min_step',b'\x00\x00\xF4\x11max_step',b'\x00\x04\x14\x11linesearch'),(b'\x00\x00\x04\x29\x00\x00\x00\x02$3',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11g1',b'\x00\x00\x08\x11g2',b'\x00\x00\x08\x11m',b'\x00\x00\x08\x11v',b'\x00\x00\x08\x11mh',b'\x00\x00\x08\x11vh',b'\x00\x00\x08\x11pf',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11fx_prev',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x04\x2A\x00\x00\x00\x02$4',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11xp',b'\x00\x00\x08\x11g',b'\x00\x00\x08\x11gp',b'\x00\x00\x08\x11d',b'\x00\x00\x08\x11pf',b'\x00\x00\x08\x11lmal',b'\x00\x00\x08\x11lmys',b'\x00\x00\x08\x11lms',b'\x00\x00\x08\x11lmy',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11step',b'\x00\x00\x22\x11j',b'\x00\x00\x22\x11k',b'\x00\x00\x22\x11end',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x03\xF7\x00\x00\x00\x03$__mbstate_t',b'\x00\x03\xFF\x11__mbstate8',b'\x00\x00\xDB\x11_mbstateL'),(b'\x00\x00\x03\xF8\x00\x00\x00\x02$block_q2_K',b'\x00\x04\x44\x11scales',b'\x00\x04\x48\x11qs',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin'),(b'\x00\x00\x03\xF9\x00\x00\x00\x02$block_q3_K',b'\x00\x04\x46\x11hmask',b'\x00\x04\x48\x11qs',b'\x00\x04\x42\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFA\x00\x00\x00\x02$block_q4_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFB\x00\x00\x00\x02$block_q5_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x46\x11qh',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFC\x00\x00\x00\x02$block_q6_K',b'\x00\x04\x40\x11ql',b'\x00\x04\x48\x11qh',b'\x00\x04\x23\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFD\x00\x00\x00\x02$block_q8_K',b'\x00\x00\xF4\x11d',b'\x00\x04\x25\x11qs',b'\x00\x04\x21\x11bsums'),(b'\x00\x00\x04\x18\x00\x00\x00\x02$ggml_type_traits_t',b'\x00\x00\x0F\x11type_name',b'\x00\x00\x22\x11blck_size',b'\x00\x00\x11\x11type_size',b'\x00\x00\xB6\x11is_quantized',b'\x00\x04\x52\x11to_float',b'\x00\x04\x4F\x11from_float',b'\x00\x04\x4F\x11from_float_reference',b'\x00\x04\x50\x11vec_dot',b'\x00\x00\x01\x11vec_dot_type'),(b'\x00\x00\x04\x2C\x00\x00\x00\x02__darwin_pthread_handler_rec',b'\x00\x04\x51\x11__routine',b'\x00\x00\x10\x11__arg',b'\x00\x04\x2B\x11__next'),(b'\x00\x00\x03\xEF\x00\x00\x00\x02_opaque_pthread_attr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF0\x00\x00\x00\x02_opaque_pthread_cond_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x07\x11__opaque'),(b'\x00\x00\x03\xF1\x00\x00\x00\x02_opaque_pthread_condattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF2\x00\x00\x00\x02_opaque_pthread_mutex_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF3\x00\x00\x00\x02_opaque_pthread_mutexattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF4\x00\x00\x0
0\x02_opaque_pthread_once_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF5\x00\x00\x00\x02_opaque_pthread_rwlock_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x03\x11__opaque'),(b'\x00\x00\x03\xF6\x00\x00\x00\x02_opaque_pthread_rwlockattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x01\x11__opaque'),(b'\x00\x00\x04\x2E\x00\x00\x00\x02_opaque_pthread_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x2B\x11__cleanup_stack',b'\x00\x04\x0F\x11__opaque'),(b'\x00\x00\x04\x2F\x00\x00\x00\x10ggml_allocr',),(b'\x00\x00\x04\x30\x00\x00\x00\x02ggml_cgraph',b'\x00\x00\x22\x11n_nodes',b'\x00\x00\x22\x11n_leafs',b'\x00\x04\x39\x11nodes',b'\x00\x04\x39\x11grads',b'\x00\x04\x39\x11leafs',b'\x00\x04\x4D\x11visited_hash_table',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us'),(b'\x00\x00\x04\x31\x00\x00\x00\x02ggml_compute_params',b'\x00\x04\x17\x11type',b'\x00\x00\x22\x11ith',b'\x00\x00\x22\x11nth',b'\x00\x00\x11\x11wsize',b'\x00\x00\x10\x11wdata'),(b'\x00\x00\x04\x32\x00\x00\x00\x10ggml_context',),(b'\x00\x00\x04\x33\x00\x00\x00\x02ggml_cplan',b'\x00\x00\x11\x11work_size',b'\x00\x04\x3F\x11work_data',b'\x00\x00\x22\x11n_threads',b'\x00\x04\x19\x11n_tasks',b'\x00\x03\xEE\x11abort_callback',b'\x00\x00\x10\x11abort_callback_data'),(b'\x00\x00\x00\xBC\x00\x00\x00\x02ggml_init_params',b'\x00\x00\x11\x11mem_size',b'\x00\x00\x10\x11mem_buffer',b'\x00\x00\xB6\x11no_alloc'),(b'\x00\x00\x04\x34\x00\x00\x00\x10ggml_metal_context',),(b'\x00\x00\x04\x35\x00\x00\x00\x10ggml_mpi_context',),(b'\x00\x00\x04\x37\x00\x00\x00\x02ggml_object',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x04\x36\x11next',b'\x00\x04\x15\x11type',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x38\x00\x00\x00\x02ggml_opt_context',b'\x00\x00\x0B\x11ctx',b'\x00\x00\x50\x11params',b'\x00\x00\x22\x11iter',b'\x00\x00\xDB\x11nx',b'\x00\x00\xB6\x11just_initialized',b'\x00\x04\x29\x11adam',b'\x00\x04\x2A\x11lbfgs'),(b'\x00\x00\x00\x50\x00\x00\x00\x02ggml_opt_params',b'\x00\x00\xC8\x11type',b'\x00\x00\x22\x11n_threads',b'\x00\x00\x22\x11past',b'\x00\x00\xF4\x11delta',b'\x00\x00\x22\x11max_no_improvement',b'\x00\x00\xB6\x11print_forward_graph',b'\x00\x00\xB6\x11print_backward_graph',b'\x00\x04\x27\x11adam',b'\x00\x04\x28\x11lbfgs'),(b'\x00\x00\x02\x48\x00\x00\x00\x02ggml_scratch',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x00\x10\x11data'),(b'\x00\x00\x04\x3D\x00\x00\x00\x02ggml_tensor',b'\x00\x00\x01\x11type',b'\x00\x04\x13\x11backend',b'\x00\x00\x22\x11n_dims',b'\x00\x04\x1E\x11ne',b'\x00\x04\x4B\x11nb',b'\x00\x00\x2E\x11op',b'\x00\x04\x1B\x11op_params',b'\x00\x00\xB6\x11is_param',b'\x00\x00\x08\x11grad',b'\x00\x04\x3B\x11src',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us',b'\x00\x00\x10\x11data',b'\x00\x04\x0D\x11name',b'\x00\x00\x10\x11extra',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x3E\x00\x00\x00\x10gguf_context',),(b'\x00\x00\x02\x24\x00\x00\x00\x02gguf_init_params',b'\x00\x00\xB6\x11no_alloc',b'\x00\x00\xB0\x11ctx')), + _enums = 
(b'\x00\x00\x04\x13\x00\x00\x00\x16ggml_backend\x00GGML_BACKEND_CPU,GGML_BACKEND_GPU,GGML_BACKEND_GPU_SPLIT',b'\x00\x00\x00\x54\x00\x00\x00\x15ggml_ftype\x00GGML_FTYPE_UNKNOWN,GGML_FTYPE_ALL_F32,GGML_FTYPE_MOSTLY_F16,GGML_FTYPE_MOSTLY_Q4_0,GGML_FTYPE_MOSTLY_Q4_1,GGML_FTYPE_MOSTLY_Q4_1_SOME_F16,GGML_FTYPE_MOSTLY_Q8_0,GGML_FTYPE_MOSTLY_Q5_0,GGML_FTYPE_MOSTLY_Q5_1,GGML_FTYPE_MOSTLY_Q2_K,GGML_FTYPE_MOSTLY_Q3_K,GGML_FTYPE_MOSTLY_Q4_K,GGML_FTYPE_MOSTLY_Q5_K,GGML_FTYPE_MOSTLY_Q6_K',b'\x00\x00\x04\x14\x00\x00\x00\x16ggml_linesearch\x00GGML_LINESEARCH_DEFAULT,GGML_LINESEARCH_BACKTRACKING_ARMIJO,GGML_LINESEARCH_BACKTRACKING_WOLFE,GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',b'\x00\x00\x04\x15\x00\x00\x00\x16ggml_object_type\x00GGML_OBJECT_TENSOR,GGML_OBJECT_GRAPH,GGML_OBJECT_WORK_BUFFER',b'\x00\x00\x00\x2E\x00\x00\x00\x16ggml_op\x00GGML_OP_NONE,GGML_OP_DUP,GGML_OP_ADD,GGML_OP_ADD1,GGML_OP_ACC,GGML_OP_SUB,GGML_OP_MUL,GGML_OP_DIV,GGML_OP_SQR,GGML_OP_SQRT,GGML_OP_LOG,GGML_OP_SUM,GGML_OP_SUM_ROWS,GGML_OP_MEAN,GGML_OP_ARGMAX,GGML_OP_REPEAT,GGML_OP_REPEAT_BACK,GGML_OP_SILU_BACK,GGML_OP_NORM,GGML_OP_RMS_NORM,GGML_OP_RMS_NORM_BACK,GGML_OP_MUL_MAT,GGML_OP_OUT_PROD,GGML_OP_SCALE,GGML_OP_SET,GGML_OP_CPY,GGML_OP_CONT,GGML_OP_RESHAPE,GGML_OP_VIEW,GGML_OP_PERMUTE,GGML_OP_TRANSPOSE,GGML_OP_GET_ROWS,GGML_OP_GET_ROWS_BACK,GGML_OP_DIAG,GGML_OP_DIAG_MASK_INF,GGML_OP_DIAG_MASK_ZERO,GGML_OP_SOFT_MAX,GGML_OP_SOFT_MAX_BACK,GGML_OP_ROPE,GGML_OP_ROPE_BACK,GGML_OP_ALIBI,GGML_OP_CLAMP,GGML_OP_CONV_1D,GGML_OP_CONV_2D,GGML_OP_POOL_1D,GGML_OP_POOL_2D,GGML_OP_FLASH_ATTN,GGML_OP_FLASH_FF,GGML_OP_FLASH_ATTN_BACK,GGML_OP_WIN_PART,GGML_OP_WIN_UNPART,GGML_OP_UNARY,GGML_OP_MAP_UNARY,GGML_OP_MAP_BINARY,GGML_OP_MAP_CUSTOM1_F32,GGML_OP_MAP_CUSTOM2_F32,GGML_OP_MAP_CUSTOM3_F32,GGML_OP_MAP_CUSTOM1,GGML_OP_MAP_CUSTOM2,GGML_OP_MAP_CUSTOM3,GGML_OP_CROSS_ENTROPY_LOSS,GGML_OP_CROSS_ENTROPY_LOSS_BACK,GGML_OP_COUNT',b'\x00\x00\x01\x01\x00\x00\x00\x16ggml_op_pool\x00GGML_OP_POOL_MAX,GGML_OP_POOL_AVG,GGML_OP_POOL_COUNT',b'\x00\x00\x04\x16\x00\x00\x00\x15ggml_opt_result\x00GGML_OPT_OK,GGML_OPT_DID_NOT_CONVERGE,GGML_OPT_NO_CONTEXT,GGML_OPT_INVALID_WOLFE,GGML_OPT_FAIL,GGML_LINESEARCH_FAIL,GGML_LINESEARCH_MINIMUM_STEP,GGML_LINESEARCH_MAXIMUM_STEP,GGML_LINESEARCH_MAXIMUM_ITERATIONS,GGML_LINESEARCH_INVALID_PARAMETERS',b'\x00\x00\x00\xC8\x00\x00\x00\x16ggml_opt_type\x00GGML_OPT_ADAM,GGML_OPT_LBFGS',b'\x00\x00\x04\x17\x00\x00\x00\x16ggml_task_type\x00GGML_TASK_INIT,GGML_TASK_COMPUTE,GGML_TASK_FINALIZE',b'\x00\x00\x00\x01\x00\x00\x00\x16ggml_type\x00GGML_TYPE_F32,GGML_TYPE_F16,GGML_TYPE_Q4_0,GGML_TYPE_Q4_1,GGML_TYPE_Q5_0,GGML_TYPE_Q5_1,GGML_TYPE_Q8_0,GGML_TYPE_Q8_1,GGML_TYPE_Q2_K,GGML_TYPE_Q3_K,GGML_TYPE_Q4_K,GGML_TYPE_Q5_K,GGML_TYPE_Q6_K,GGML_TYPE_Q8_K,GGML_TYPE_I8,GGML_TYPE_I16,GGML_TYPE_I32,GGML_TYPE_COUNT',b'\x00\x00\x01\x14\x00\x00\x00\x16ggml_unary_op\x00GGML_UNARY_OP_ABS,GGML_UNARY_OP_SGN,GGML_UNARY_OP_NEG,GGML_UNARY_OP_STEP,GGML_UNARY_OP_TANH,GGML_UNARY_OP_ELU,GGML_UNARY_OP_RELU,GGML_UNARY_OP_GELU,GGML_UNARY_OP_GELU_QUICK,GGML_UNARY_OP_SILU',b'\x00\x00\x00\x34\x00\x00\x00\x16gguf_type\x00GGUF_TYPE_UINT8,GGUF_TYPE_INT8,GGUF_TYPE_UINT16,GGUF_TYPE_INT16,GGUF_TYPE_UINT32,GGUF_TYPE_INT32,GGUF_TYPE_FLOAT32,GGUF_TYPE_BOOL,GGUF_TYPE_STRING,GGUF_TYPE_ARRAY,GGUF_TYPE_COUNT'), + _typenames = 
(b'\x00\x00\x00\xDB__darwin_blkcnt_t',b'\x00\x00\x00\x22__darwin_blksize_t',b'\x00\x00\x00\x11__darwin_clock_t',b'\x00\x00\x00\x22__darwin_ct_rune_t',b'\x00\x00\x00\x22__darwin_dev_t',b'\x00\x00\x03\xBF__darwin_fsblkcnt_t',b'\x00\x00\x03\xBF__darwin_fsfilcnt_t',b'\x00\x00\x03\xBF__darwin_gid_t',b'\x00\x00\x03\xBF__darwin_id_t',b'\x00\x00\x04\x4A__darwin_ino64_t',b'\x00\x00\x04\x4A__darwin_ino_t',b'\x00\x00\x04\x20__darwin_intptr_t',b'\x00\x00\x03\xBF__darwin_mach_port_name_t',b'\x00\x00\x03\xBF__darwin_mach_port_t',b'\x00\x00\x03\xF7__darwin_mbstate_t',b'\x00\x00\x00\x6C__darwin_mode_t',b'\x00\x00\x03\xBF__darwin_natural_t',b'\x00\x00\x00\xDB__darwin_off_t',b'\x00\x00\x00\x22__darwin_pid_t',b'\x00\x00\x03\xEF__darwin_pthread_attr_t',b'\x00\x00\x03\xF0__darwin_pthread_cond_t',b'\x00\x00\x03\xF1__darwin_pthread_condattr_t',b'\x00\x00\x00\x11__darwin_pthread_key_t',b'\x00\x00\x03\xF2__darwin_pthread_mutex_t',b'\x00\x00\x03\xF3__darwin_pthread_mutexattr_t',b'\x00\x00\x03\xF4__darwin_pthread_once_t',b'\x00\x00\x03\xF5__darwin_pthread_rwlock_t',b'\x00\x00\x03\xF6__darwin_pthread_rwlockattr_t',b'\x00\x00\x04\x2D__darwin_pthread_t',b'\x00\x00\x04\x20__darwin_ptrdiff_t',b'\x00\x00\x00\x22__darwin_rune_t',b'\x00\x00\x03\xBF__darwin_sigset_t',b'\x00\x00\x00\x11__darwin_size_t',b'\x00\x00\x03\xBF__darwin_socklen_t',b'\x00\x00\x04\x20__darwin_ssize_t',b'\x00\x00\x00\x22__darwin_suseconds_t',b'\x00\x00\x04\x20__darwin_time_t',b'\x00\x00\x03\xBF__darwin_uid_t',b'\x00\x00\x03\xBF__darwin_useconds_t',b'\x00\x00\x04\x05__darwin_uuid_string_t',b'\x00\x00\x04\x44__darwin_uuid_t',b'\x00\x00\x00\x22__darwin_wchar_t',b'\x00\x00\x00\x22__darwin_wint_t',b'\x00\x00\x03\xB0__int16_t',b'\x00\x00\x00\x22__int32_t',b'\x00\x00\x00\xDB__int64_t',b'\x00\x00\x03\xB5__int8_t',b'\x00\x00\x03\xF7__mbstate_t',b'\x00\x00\x00\x6C__uint16_t',b'\x00\x00\x03\xBF__uint32_t',b'\x00\x00\x04\x4A__uint64_t',b'\x00\x00\x03\xBA__uint8_t',b'\x00\x00\x03\xF8block_q2_K',b'\x00\x00\x03\xF9block_q3_K',b'\x00\x00\x03\xFAblock_q4_K',b'\x00\x00\x03\xFBblock_q5_K',b'\x00\x00\x03\xFCblock_q6_K',b'\x00\x00\x03\xFDblock_q8_K',b'\x00\x00\x01\xEAggml_binary_op_f32_t',b'\x00\x00\x02\x02ggml_custom1_op_f32_t',b'\x00\x00\x02\x07ggml_custom1_op_t',b'\x00\x00\x01\xF0ggml_custom2_op_f32_t',b'\x00\x00\x01\xF6ggml_custom2_op_t',b'\x00\x00\x01\xC5ggml_custom3_op_f32_t',b'\x00\x00\x01\xCCggml_custom3_op_t',b'\x00\x00\x00\x6Cggml_fp16_t',b'\x00\x00\x04\x4Fggml_from_float_t',b'\x00\x00\x04\x52ggml_to_float_t',b'\x00\x00\x04\x18ggml_type_traits_t',b'\x00\x00\x01\xFDggml_unary_op_f32_t',b'\x00\x00\x04\x50ggml_vec_dot_t',b'\x00\x00\x03\xB0int16_t',b'\x00\x00\x00\x22int32_t',b'\x00\x00\x00\xDBint64_t',b'\x00\x00\x03\xB5int8_t',b'\x00\x00\x03\xB0int_fast16_t',b'\x00\x00\x00\x22int_fast32_t',b'\x00\x00\x00\xDBint_fast64_t',b'\x00\x00\x03\xB5int_fast8_t',b'\x00\x00\x03\xB0int_least16_t',b'\x00\x00\x00\x22int_least32_t',b'\x00\x00\x00\xDBint_least64_t',b'\x00\x00\x03\xB5int_least8_t',b'\x00\x00\x04\x20intmax_t',b'\x00\x00\x04\x20intptr_t',b'\x00\x00\x04\x1Dmax_align_t',b'\x00\x00\x04\x20ptrdiff_t',b'\x00\x00\x00\xDBregister_t',b'\x00\x00\x00\x11rsize_t',b'\x00\x00\x00\x11size_t',b'\x00\x00\x04\x4Asyscall_arg_t',b'\x00\x00\x00\x6Cu_int16_t',b'\x00\x00\x03\xBFu_int32_t',b'\x00\x00\x04\x4Au_int64_t',b'\x00\x00\x03\xBAu_int8_t',b'\x00\x00\x00\x6Cuint16_t',b'\x00\x00\x03\xBFuint32_t',b'\x00\x00\x04\x4Auint64_t',b'\x00\x00\x03\xBAuint8_t',b'\x00\x00\x00\x6Cuint_fast16_t',b'\x00\x00\x03\xBFuint_fast32_t',b'\x00\x00\x04\x4Auint_fast64_t',b'\x00\x00\x03\xBAuint_fast8_t',b'\x00\x00\
x00\x6Cuint_least16_t',b'\x00\x00\x03\xBFuint_least32_t',b'\x00\x00\x04\x4Auint_least64_t',b'\x00\x00\x03\xBAuint_least8_t',b'\x00\x00\x00\x11uintmax_t',b'\x00\x00\x00\x11uintptr_t',b'\x00\x00\x04\x4Auser_addr_t',b'\x00\x00\x00\xDBuser_long_t',b'\x00\x00\x00\xDBuser_off_t',b'\x00\x00\x04\x4Auser_size_t',b'\x00\x00\x00\xDBuser_ssize_t',b'\x00\x00\x00\xDBuser_time_t',b'\x00\x00\x04\x4Auser_ulong_t',b'\x00\x00\x00\x22wchar_t'),
+)
diff --git a/stable-diffusion.cpp/ggml/examples/python/ggml/ffi/__init__.pyi b/stable-diffusion.cpp/ggml/examples/python/ggml/ffi/__init__.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..73117a1c6b9c2bc71e839c1b66e193c1bce4ecde
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/python/ggml/ffi/__init__.pyi
@@ -0,0 +1,7 @@
+# Phony stubs.
+
+class CData:
+    pass
+
+class CType:
+    pass
\ No newline at end of file
diff --git a/stable-diffusion.cpp/ggml/examples/python/ggml/utils.py b/stable-diffusion.cpp/ggml/examples/python/ggml/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cea2bf71b423f0040d7a1ed1ac601628771520e
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/python/ggml/utils.py
@@ -0,0 +1,182 @@
+"""
+    Common helpers for working with ggml + numpy
+"""
+from ggml import ffi, lib
+from typing import Union, Optional
+import numpy as np
+
+def init(mem_size: int, mem_buffer: ffi.CData = ffi.NULL, no_alloc: bool = False) -> ffi.CData:
+    """
+    Initialize a ggml context, which will be freed automatically when the pointer is garbage collected.
+    """
+    params = ffi.new('struct ggml_init_params*')
+    params.mem_size = mem_size
+    params.mem_buffer = mem_buffer
+    params.no_alloc = no_alloc
+    return ffi.gc(lib.ggml_init(params[0]), lib.ggml_free)
+
+TensorLike = Union[ffi.CData, np.ndarray]
+
+def copy(from_tensor: TensorLike, to_tensor: TensorLike, allow_requantize: bool = True):
+    """
+    Copy the contents of one tensor to another, doing any necessary (de/re)quantization transparently.
+    Works across numpy & ggml tensors, but they must have the same shape (and be contiguous).
+
+    Parameters
+    ----------
+    from_tensor : TensorLike
+        The tensor to copy from (a numpy array or possibly-quantized ggml tensor)
+    to_tensor : TensorLike
+        The tensor to copy to (a numpy array or possibly-quantized ggml tensor)
+    allow_requantize : bool
+        If False, will throw an error if requantization is required (i.e. both from_tensor
+        and to_tensor are quantized with different quantization types)
+    """
+    if id(from_tensor) == id(to_tensor):
+        return
+
+    __expect_same_layout("source", from_tensor, "destination", to_tensor)
+    __check_shape_consistent_with_type(from_tensor)
+    __check_shape_consistent_with_type(to_tensor)
+
+    from_type = __get_type(from_tensor)
+    to_type = __get_type(to_tensor)
+
+    if from_type == to_type:
+        ffi.memmove(__get_data(to_tensor), __get_data(from_tensor), __get_nbytes(from_tensor))
+    else:
+        assert allow_requantize or not lib.ggml_is_quantized(from_type) or not lib.ggml_is_quantized(to_type), \
+            f"Requantizing from {__type_name(from_type)} to {__type_name(to_type)} is disabled. Force with allow_requantize=True"
+
+        __set_floats(to_tensor, __get_floats(from_tensor))
+
+def numpy(tensor: ffi.CData, allow_copy: Union[bool, np.ndarray] = False, allow_requantize=False) -> np.ndarray:
+    """
+    Convert a ggml tensor to a numpy array.
+    If the tensor isn't quantized, the returned numpy array will be a view over its data.
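+
+    A minimal sketch of the no-copy path (names are the ones defined in this
+    module; ctx comes from init() above):
+
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 8)
+        view = numpy(t)   # float32 view over the tensor's own data
+        view[0] = 1.0     # writes through to the underlying ggml tensor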
+
+    If it is quantized (and allow_copy is True), the copy will involve dequantization and the returned array will
+    be a copy of the original tensor (any changes to the numpy array won't then be reflected back to the tensor).
+
+    Parameters
+    ----------
+    tensor : ffi.CData
+        The tensor to convert to a numpy array
+    allow_copy : bool or np.ndarray
+        If False, will throw an error if the tensor is quantized (since dequantization requires extra memory).
+        If True, will dequantize the tensor and return a copy of the data in a new float32 numpy array.
+        If an np.ndarray, will copy the data into the given array (which must be the same shape as the tensor) when dequantization is needed
+    allow_requantize : bool
+        If allow_copy is a tensor with a different quantization type than the source tensor, will throw an error unless allow_requantize is True.
+    """
+    shape = __get_shape(tensor)
+
+    if lib.ggml_is_quantized(tensor.type):
+        if allow_copy is False:
+            raise ValueError(f"{__describe(tensor)} is quantized, conversion to numpy requires a copy (pass allow_copy=True; changes to the numpy array won't affect the original).")
+        elif isinstance(allow_copy, np.ndarray):
+            __expect_same_layout("source tensor", tensor, "dequantization output tensor", allow_copy)
+            destination = allow_copy
+        else:
+            destination = np.empty(shape, dtype=np.float32)
+
+        copy(tensor, destination, allow_requantize=allow_requantize)
+        return destination
+    else:
+        dtype = __type_to_dtype(tensor.type)
+        if not dtype:
+            raise NotImplementedError(f'Cannot convert {__describe(tensor)} to numpy')
+
+        assert __is_contiguous(tensor), f"Cannot convert {__describe(tensor)} to numpy (only contiguous tensors are supported)"
+        nbytes = lib.ggml_nelements(tensor) * lib.ggml_type_size(tensor.type)
+        array = np.frombuffer(ffi.buffer(lib.ggml_get_data(tensor), nbytes), dtype=dtype)
+        array.shape = shape
+        return array
+
+def __type_name(type: int) -> Optional[str]:
+    name = lib.ggml_type_name(type)
+    return ffi.string(name).decode('utf-8') if name else None
+
+__k_quant_types = set([
+    lib.GGML_TYPE_Q2_K,
+    lib.GGML_TYPE_Q3_K,
+    lib.GGML_TYPE_Q4_K,
+    lib.GGML_TYPE_Q5_K,
+    lib.GGML_TYPE_Q6_K,
+    lib.GGML_TYPE_Q8_K,
+])
+
+__type_to_dtype_dict = {
+    lib.GGML_TYPE_I8: np.int8,
+    lib.GGML_TYPE_I16: np.int16,
+    lib.GGML_TYPE_I32: np.int32,
+    lib.GGML_TYPE_F16: np.float16,
+    lib.GGML_TYPE_F32: np.float32,
+}
+
+def __type_to_dtype(type: int) -> Optional[np.dtype]: return __type_to_dtype_dict.get(type)
+def __dtype_to_type(dtype: np.dtype):
+    if dtype == np.float32: return lib.GGML_TYPE_F32
+    elif dtype == np.float16: return lib.GGML_TYPE_F16
+    elif dtype == np.int32: return lib.GGML_TYPE_I32
+    elif dtype == np.int16: return lib.GGML_TYPE_I16
+    elif dtype == np.int8: return lib.GGML_TYPE_I8
+    else: raise ValueError(f"Unsupported dtype: {dtype}")
+
+def __describe(tensor: ffi.CData): return f'Tensor[{__type_name(__get_type(tensor))}, {__get_shape(tensor)}]'
+def __get_type(tensor: TensorLike): return __dtype_to_type(tensor.dtype) if isinstance(tensor, np.ndarray) else tensor.type
+def __get_shape(x: TensorLike): return x.shape if isinstance(x, np.ndarray) else tuple([x.ne[i] for i in range(x.n_dims)])
+def __get_strides(x: TensorLike): return x.strides if isinstance(x, np.ndarray) else tuple([x.nb[i] for i in range(x.n_dims)])
+def __get_data(x: TensorLike) -> ffi.CData: return ffi.from_buffer(x) if isinstance(x, np.ndarray) else lib.ggml_get_data(x)
+def __get_nbytes(tensor: TensorLike): return tensor.nbytes if isinstance(tensor, np.ndarray) else lib.ggml_nbytes(tensor)
+def __get_nelements(tensor: TensorLike): return tensor.size if isinstance(tensor, np.ndarray) else lib.ggml_nelements(tensor)
+def __is_contiguous(tensor: TensorLike): return tensor.flags['C_CONTIGUOUS'] if isinstance(tensor, np.ndarray) else lib.ggml_is_contiguous(tensor)
+
+def __get_floats(tensor: TensorLike) -> ffi.CData:
+    data, type = __get_data(tensor), __get_type(tensor)
+    if type == lib.GGML_TYPE_F32:
+        return ffi.cast('float*', data)
+    else:
+        nelements = __get_nelements(tensor)
+        floats = ffi.new('float[]', nelements)
+        if type == lib.GGML_TYPE_F16:
+            lib.ggml_fp16_to_fp32_row(ffi.cast('uint16_t*', data), floats, nelements)
+        elif lib.ggml_is_quantized(type):
+            qtype = lib.ggml_internal_get_type_traits(type)
+            assert qtype.to_float, f"Type {__type_name(type)} is not supported by ggml"
+            qtype.to_float(data, floats, nelements)
+        else:
+            raise NotImplementedError(f'Cannot read floats from {__describe(tensor)}')
+        return floats
+
+def __set_floats(tensor: TensorLike, f32_data: ffi.CData) -> None:
+    data, type, nbytes = __get_data(tensor), __get_type(tensor), __get_nbytes(tensor)
+    if type == lib.GGML_TYPE_F32:
+        ffi.memmove(data, f32_data, nbytes)
+    else:
+        nelements = __get_nelements(tensor)
+        if type == lib.GGML_TYPE_F16:
+            lib.ggml_fp32_to_fp16_row(f32_data, ffi.cast('uint16_t*', data), nelements)
+        elif lib.ggml_is_quantized(type):
+            qtype = lib.ggml_internal_get_type_traits(type)
+            assert qtype.from_float, f"Type {__type_name(type)} is not supported by ggml"
+            qtype.from_float(f32_data, data, nelements)
+        else:
+            raise NotImplementedError(f'Cannot write floats to {__describe(tensor)}')
+
+def __expect_same_layout(name1: str, tensor1: TensorLike, name2: str, tensor2: TensorLike):
+    shape1, shape2 = __get_shape(tensor1), __get_shape(tensor2)
+    assert shape1 == shape2, f"Shape mismatch: {name1} has {shape1} but {name2} has {shape2}"
+    assert __is_contiguous(tensor1) and __is_contiguous(tensor2), f"Only contiguous tensors are supported (got {name1} with strides {__get_strides(tensor1)} and {name2} with strides {__get_strides(tensor2)})"
+
+def __check_shape_consistent_with_type(tensor: TensorLike):
+    type = __get_type(tensor)
+    if not lib.ggml_is_quantized(type):
+        return
+    shape = __get_shape(tensor)
+
+    block_size = lib.ggml_blck_size(type)
+    assert not (block_size == 0 and type in __k_quant_types), "Can't quantize, native library was not compiled with USE_K_QUANTS!"
+    assert block_size > 0, f"Invalid block size {block_size} for type {__type_name(type)}"
+    for i, d in enumerate(shape):
+        assert d % block_size == 0, f"Dimension {i} of {__describe(tensor)} is not divisible by {block_size}, required for quantization."
diff --git a/stable-diffusion.cpp/ggml/examples/python/regenerate.py b/stable-diffusion.cpp/ggml/examples/python/regenerate.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d84c0367e582b0e6149234ac325079cb46447f
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/python/regenerate.py
@@ -0,0 +1,42 @@
+# Generates bindings for the ggml library.
+#
+# cffi requires prior C preprocessing of the headers, and it uses pycparser which chokes on a couple of things
+# so we help it a bit (e.g. replace sizeof expressions with their value, remove exotic syntax found in Darwin headers).
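+#
+# A typical invocation might look like this (paths are illustrative; API, CC,
+# C_INCLUDE_DIR and CPPFLAGS are read from the environment, with the defaults
+# visible below):
+#
+#   C_INCLUDE_DIR=../../../llama.cpp API=api.h python regenerate.py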
+import os, sys, re, subprocess
+import cffi
+from stubs import generate_stubs
+
+API = os.environ.get('API', 'api.h')
+CC = os.environ.get('CC') or 'gcc'
+C_INCLUDE_DIR = os.environ.get('C_INCLUDE_DIR', '../../../llama.cpp')
+CPPFLAGS = [
+    "-I", C_INCLUDE_DIR,
+    '-D__fp16=uint16_t', # pycparser doesn't support __fp16
+    '-D__attribute__(x)=',
+    '-D_Static_assert(x, m)=',
+] + [x for x in os.environ.get('CPPFLAGS', '').split(' ') if x != '']
+
+try: header = subprocess.run([CC, "-E", *CPPFLAGS, API], capture_output=True, text=True, check=True).stdout
+except subprocess.CalledProcessError as e: print(f'{e.stderr}\n{e}', file=sys.stderr); raise
+
+header = '\n'.join([l for l in header.split('\n') if '__darwin_va_list' not in l]) # pycparser hates this
+
+# Replace constant size expressions w/ their value (compile & run a mini exe for each, because why not).
+# First, extract anything *inside* square brackets and anything that looks like a sizeof call.
+for expr in set(re.findall(f'(?<=\\[)[^\\]]+(?=])|sizeof\\s*\\([^()]+\\)', header)):
+    if re.match(r'^(\d+|\s*)$', expr): continue # skip constants and empty bracket contents
+    subprocess.run([CC, "-o", "eval_size_expr", *CPPFLAGS, "-x", "c", "-"], text=True, check=True,
+                   input=f'''#include <stdio.h>
+                             #include "{API}"
+                             int main() {{ printf("%lu", (size_t)({expr})); }}''')
+    size = subprocess.run(["./eval_size_expr"], capture_output=True, text=True, check=True).stdout
+    print(f'Computed constexpr {expr} = {size}')
+    header = header.replace(expr, size)
+
+ffibuilder = cffi.FFI()
+ffibuilder.cdef(header)
+ffibuilder.set_source('ggml.cffi', None) # we're not compiling a native extension, as this quickly gets hairy
+ffibuilder.compile(verbose=True)
+
+with open("ggml/__init__.pyi", "wt") as f:
+    f.write(generate_stubs(header))
\ No newline at end of file
diff --git a/stable-diffusion.cpp/ggml/examples/python/stubs.py b/stable-diffusion.cpp/ggml/examples/python/stubs.py
new file mode 100644
index 0000000000000000000000000000000000000000..adf3d6c57aa3d19495e33e8e79772a4eda1e4902
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/python/stubs.py
@@ -0,0 +1,128 @@
+"""
+    This generates .pyi stubs for the cffi Python bindings generated by regenerate.py
+"""
+import sys, re, itertools
+sys.path.extend(['.', '..']) # for pycparser
+
+from pycparser import c_ast, parse_file, CParser
+import pycparser.plyparser
+from pycparser.c_ast import PtrDecl, TypeDecl, FuncDecl, EllipsisParam, IdentifierType, Struct, Enum, Typedef
+from typing import Tuple
+
+__c_type_to_python_type = {
+    'void': 'None', '_Bool': 'bool',
+    'char': 'int', 'short': 'int', 'int': 'int', 'long': 'int',
+    'ptrdiff_t': 'int', 'size_t': 'int',
+    'int8_t': 'int', 'uint8_t': 'int',
+    'int16_t': 'int', 'uint16_t': 'int',
+    'int32_t': 'int', 'uint32_t': 'int',
+    'int64_t': 'int', 'uint64_t': 'int',
+    'float': 'float', 'double': 'float',
+    'ggml_fp16_t': 'np.float16',
+}
+
+def format_type(t: TypeDecl):
+    if isinstance(t, PtrDecl) or isinstance(t, Struct):
+        return 'ffi.CData'
+    if isinstance(t, Enum):
+        return 'int'
+    if isinstance(t, TypeDecl):
+        return format_type(t.type)
+    if isinstance(t, IdentifierType):
+        assert len(t.names) == 1, f'Expected a single name, got {t.names}'
+        return __c_type_to_python_type.get(t.names[0]) or 'ffi.CData'
+    return t.name
+
+class PythonStubFuncDeclVisitor(c_ast.NodeVisitor):
+    def __init__(self):
+        self.sigs = {}
+        self.sources = {}
+
+    def get_source_snippet_lines(self, coord: pycparser.plyparser.Coord) -> Tuple[list[str], list[str]]:
+        if coord.file not in self.sources:
+            with open(coord.file, 'rt') as f:
+                self.sources[coord.file] = f.readlines()
+        source_lines = self.sources[coord.file]
+        ncomment_lines = len(list(itertools.takewhile(lambda i: re.search(r'^\s*(//|/\*)', source_lines[i]), range(coord.line - 2, -1, -1))))
+        comment_lines = [l.strip() for l in source_lines[coord.line - 1 - ncomment_lines:coord.line - 1]]
+        decl_lines = []
+        for line in source_lines[coord.line - 1:]:
+            decl_lines.append(line.rstrip())
+            if (';' in line) or ('{' in line): break
+        return (comment_lines, decl_lines)
+
+    def visit_Enum(self, node: Enum):
+        if node.values is not None:
+            for e in node.values.enumerators:
+                self.sigs[e.name] = f'    @property\n    def {e.name}(self) -> int: ...'
+
+    def visit_Typedef(self, node: Typedef):
+        pass
+
+    def visit_FuncDecl(self, node: FuncDecl):
+        ret_type = node.type
+        is_ptr = False
+        while isinstance(ret_type, PtrDecl):
+            ret_type = ret_type.type
+            is_ptr = True
+
+        fun_name = ret_type.declname
+        if fun_name.startswith('__'):
+            return
+
+        args = []
+        argnames = []
+        def gen_name(stem):
+            i = 1
+            while True:
+                new_name = stem if i == 1 else f'{stem}{i}'
+                if new_name not in argnames: return new_name
+                i += 1
+
+        for a in node.args.params:
+            if isinstance(a, EllipsisParam):
+                arg_name = gen_name('args')
+                argnames.append(arg_name)
+                args.append('*' + arg_name)
+            elif format_type(a.type) == 'None':
+                continue
+            else:
+                arg_name = a.name or gen_name('arg')
+                argnames.append(arg_name)
+                args.append(f'{arg_name}: {format_type(a.type)}')
+
+        ret = format_type(ret_type if not is_ptr else node.type)
+
+        comment_lines, decl_lines = self.get_source_snippet_lines(node.coord)
+
+        lines = [f'    def {fun_name}({", ".join(args)}) -> {ret}:']
+        if len(comment_lines) == 0 and len(decl_lines) == 1:
+            lines += [f'        """{decl_lines[0]}"""']
+        else:
+            lines += ['        """']
+            lines += [f'        {c.lstrip("/* ")}' for c in comment_lines]
+            if len(comment_lines) > 0:
+                lines += ['']
+            lines += [f'        {d}' for d in decl_lines]
+            lines += ['        """']
+        lines += ['        ...']
+        self.sigs[fun_name] = '\n'.join(lines)
+
+def generate_stubs(header: str):
+    """
+    Generates a .pyi Python stub file for the GGML API using C header files.
+    """
+
+    v = PythonStubFuncDeclVisitor()
+    v.visit(CParser().parse(header, ""))
+
+    keys = list(v.sigs.keys())
+    keys.sort()
+
+    return '\n'.join([
+        '# auto-generated file',
+        'import ggml.ffi as ffi',
+        'import numpy as np',
+        'class lib:',
+        *[v.sigs[k] for k in keys]
+    ])
diff --git a/stable-diffusion.cpp/ggml/examples/python/test_tensor.py b/stable-diffusion.cpp/ggml/examples/python/test_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a365fae3a6287e15172a4003d2d992f245e4904
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/python/test_tensor.py
@@ -0,0 +1,258 @@
+import pytest
+from pytest import raises
+
+from ggml import lib, ffi
+from ggml.utils import init, copy, numpy
+import numpy as np
+import numpy.testing as npt
+
+@pytest.fixture()
+def ctx():
+    print("setup")
+    yield init(mem_size=10*1024*1024)
+    print("teardown")
+
+class TestNumPy:
+
+    # Single element
+
+    def test_set_get_single_i32(self, ctx):
+        i = lib.ggml_new_i32(ctx, 42)
+        assert lib.ggml_get_i32_1d(i, 0) == 42
+        assert numpy(i) == np.array([42], dtype=np.int32)
+
+    def test_set_get_single_f32(self, ctx):
+        i = lib.ggml_new_f32(ctx, 4.2)
+
+        epsilon = 0.000001 # 4.2 has no exact float32 representation, so allow a small absolute tolerance
+        assert lib.ggml_get_f32_1d(i, 0) == pytest.approx(4.2, abs=epsilon)
+        assert numpy(i) == pytest.approx(np.array([4.2], dtype=np.float32), abs=epsilon)
+
+    def _test_copy_np_to_ggml(self, a: np.ndarray, t: ffi.CData):
+        a2 = a.copy() # Clone original
+        copy(a, t)
+        npt.assert_array_equal(numpy(t), a2)
+
+    # I32
+
+    def test_copy_np_to_ggml_1d_i32(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_I32, 10)
+        a = np.arange(10, dtype=np.int32)
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_2d_i32(self, ctx):
+        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_I32, 2, 3)
+        a = np.arange(2 * 3, dtype=np.int32).reshape((2, 3))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_3d_i32(self, ctx):
+        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_I32, 2, 3, 4)
+        a = np.arange(2 * 3 * 4, dtype=np.int32).reshape((2, 3, 4))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_i32(self, ctx):
+        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_I32, 2, 3, 4, 5)
+        a = np.arange(2 * 3 * 4 * 5, dtype=np.int32).reshape((2, 3, 4, 5))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_n_i32(self, ctx):
+        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
+        pdims = ffi.new('int64_t[]', len(dims))
+        for i, d in enumerate(dims): pdims[i] = d
+        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_I32, len(dims), pdims)
+        a = np.arange(np.prod(dims), dtype=np.int32).reshape(tuple(pdims))
+        self._test_copy_np_to_ggml(a, t)
+
+    # F32
+
+    def test_copy_np_to_ggml_1d_f32(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
+        a = np.arange(10, dtype=np.float32)
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_2d_f32(self, ctx):
+        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3)
+        a = np.arange(2 * 3, dtype=np.float32).reshape((2, 3))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_3d_f32(self, ctx):
+        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4)
+        a = np.arange(2 * 3 * 4, dtype=np.float32).reshape((2, 3, 4))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_f32(self, ctx):
+        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5)
+        a = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape((2, 3, 4, 5))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_n_f32(self, ctx):
+        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
+        pdims = ffi.new('int64_t[]', len(dims))
+        for i, d in enumerate(dims): pdims[i] = d
+        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F32, len(dims), pdims)
+        a = np.arange(np.prod(dims), dtype=np.float32).reshape(tuple(pdims))
+        self._test_copy_np_to_ggml(a, t)
+
+    # F16
+
+    def test_copy_np_to_ggml_1d_f16(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 10)
+        a = np.arange(10, dtype=np.float16)
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_2d_f16(self, ctx):
+        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F16, 2, 3)
+        a = np.arange(2 * 3, dtype=np.float16).reshape((2, 3))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_3d_f16(self, ctx):
+        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F16, 2, 3, 4)
+        a = np.arange(2 * 3 * 4, dtype=np.float16).reshape((2, 3, 4))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_f16(self, ctx):
+        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F16, 2, 3, 4, 5)
+        a = np.arange(2 * 3 * 4 * 5, dtype=np.float16).reshape((2, 3, 4, 5))
+        self._test_copy_np_to_ggml(a, t)
+
+    def test_copy_np_to_ggml_4d_n_f16(self, ctx):
+        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
+        pdims = ffi.new('int64_t[]', len(dims))
+        for i, d in enumerate(dims): pdims[i] = d
+        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F16, len(dims), pdims)
+        a = np.arange(np.prod(dims), dtype=np.float16).reshape(tuple(pdims))
+        self._test_copy_np_to_ggml(a, t)
+
+    # Mismatching shapes
+
+    def test_copy_mismatching_shapes_1d(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
+        a = np.arange(10, dtype=np.float32)
+        copy(a, t) # OK
+
+        a = a.reshape((5, 2))
+        with raises(AssertionError): copy(a, t)
+        with raises(AssertionError): copy(t, a)
+
+    def test_copy_mismatching_shapes_2d(self, ctx):
+        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3)
+        a = np.arange(6, dtype=np.float32)
+        copy(a.reshape((2, 3)), t) # OK
+
+        a = a.reshape((3, 2))
+        with raises(AssertionError): copy(a, t)
+        with raises(AssertionError): copy(t, a)
+
+    def test_copy_mismatching_shapes_3d(self, ctx):
+        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4)
+        a = np.arange(24, dtype=np.float32)
+        copy(a.reshape((2, 3, 4)), t) # OK
+
+        a = a.reshape((2, 4, 3))
+        with raises(AssertionError): copy(a, t)
+        with raises(AssertionError): copy(t, a)
+
+    def test_copy_mismatching_shapes_4d(self, ctx):
+        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5)
+        a = np.arange(24*5, dtype=np.float32)
+        copy(a.reshape((2, 3, 4, 5)), t) # OK
+
+        a = a.reshape((2, 3, 5, 4))
+        with raises(AssertionError): copy(a, t)
+        with raises(AssertionError): copy(t, a)
+
+    def test_copy_f16_to_f32(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1)
+        a = np.array([123.45], dtype=np.float16)
+        copy(a, t)
+        np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3)
+
+    def test_copy_f32_to_f16(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 1)
+        a = np.array([123.45], dtype=np.float32)
+        copy(a, t)
+        np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3)
+
+    def test_copy_f16_to_Q5_K(self, ctx):
+        n = 256
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
+        a = np.arange(n, dtype=np.float16)
+        copy(a, t)
+        np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05)
+
+    def test_copy_Q5_K_to_f16(self, ctx):
+        n = 256
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
+        copy(np.arange(n, dtype=np.float32), t)
+        a = np.arange(n, dtype=np.float16)
+        copy(t, a)
+        np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05)
+
+    def test_copy_i16_f32_mismatching_types(self, ctx):
+        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1)
+        a = np.arange(1, dtype=np.int16)
+        with raises(NotImplementedError): copy(a, t)
+        with raises(NotImplementedError): copy(t, a)
+
+class TestTensorCopy:
+
+    def test_copy_self(self, ctx):
+        t = lib.ggml_new_i32(ctx, 42)
+        copy(t, t)
+        assert lib.ggml_get_i32_1d(t, 0) == 42
+
+    def test_copy_1d(self, ctx):
+        t1 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
+        t2 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
+        a = np.arange(10, dtype=np.float32)
+        copy(a, t1)
+        copy(t1, t2)
+        assert np.allclose(a, numpy(t2))
+        assert np.allclose(numpy(t1), numpy(t2))
+
+class TestGraph:
+
+    def test_add(self, ctx):
+        n = 256
+        ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
+        tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
+        tsum = lib.ggml_add(ctx, ta, tb)
+        assert tsum.type == lib.GGML_TYPE_F32
+
+        gf = ffi.new('struct ggml_cgraph*')
+        lib.ggml_build_forward_expand(gf, tsum)
+
+        a = np.arange(0, n, dtype=np.float32)
+        b = np.arange(n, 0, -1, dtype=np.float32)
+        copy(a, ta)
+        copy(b, tb)
+
+        lib.ggml_graph_compute_with_ctx(ctx, gf, 1)
+
+        assert np.allclose(numpy(tsum, allow_copy=True), a + b)
+
+class TestQuantization:
+
+    def test_quantized_add(self, ctx):
+        n = 256
+        ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
+        tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
+        tsum = lib.ggml_add(ctx, ta, tb)
+        assert tsum.type == lib.GGML_TYPE_Q5_K
+
+        gf = ffi.new('struct ggml_cgraph*')
+        lib.ggml_build_forward_expand(gf, tsum)
+
+        a = np.arange(0, n, dtype=np.float32)
+        b = np.arange(n, 0, -1, dtype=np.float32)
+        copy(a, ta)
+        copy(b, tb)
+
+        lib.ggml_graph_compute_with_ctx(ctx, gf, 1)
+
+        unquantized_sum = a + b
+        sum = numpy(tsum, allow_copy=True)
+
+        # quantization introduces a bounded error; check it stays in the expected band
+        diff = np.linalg.norm(unquantized_sum - sum, np.inf)
+        assert diff > 4
+        assert diff < 5
diff --git a/stable-diffusion.cpp/ggml/examples/replit/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/replit/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..696b7f9886cfbce31a9233ff45f4baa45ab55132
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/replit/CMakeLists.txt
@@ -0,0 +1,13 @@
+#
+# replit
+
+set(TEST_TARGET replit)
+add_executable(${TEST_TARGET} main.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
+
+#
+# replit-quantize
+
+set(TEST_TARGET replit-quantize)
+add_executable(${TEST_TARGET} quantize.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/stable-diffusion.cpp/ggml/examples/replit/convert-h5-to-ggml.py b/stable-diffusion.cpp/ggml/examples/replit/convert-h5-to-ggml.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fc15a97785556e4e04526abdc747f51044c97ea
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/replit/convert-h5-to-ggml.py
@@ -0,0 +1,117 @@
+from pathlib import Path
+import sys
+import struct
+import json
+import numpy as np
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import sentencepiece.sentencepiece_model_pb2 as model
+
+if len(sys.argv) < 2:
+    print("Usage: convert-h5-to-ggml.py dir-model [ftype]\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+fname_out = sys.argv[1] + "/ggml-model.bin"
+
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+sp_proto = model.ModelProto()
+sp_proto.ParseFromString(open(Path(sys.argv[1]) / "spiece.model", "rb").read())
+
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+
+
+tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    dir_model, low_cpu_mem_usage=True, trust_remote_code=True
+)
+# print (model)
+
+# print(tokenizer.encode('I believe the meaning of life is'))
+
+list_vars = model.state_dict()
+for name in list_vars.keys():
+    print(name, list_vars[name].shape, list_vars[name].dtype)
+
+fout = open(fname_out, "wb")
+
+print(hparams)
+
+fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex
+fout.write(struct.pack("i", hparams["d_model"]))
+fout.write(struct.pack("i", hparams["max_seq_len"]))
+fout.write(struct.pack("i", hparams["n_heads"]))
hparams["n_heads"])) +fout.write(struct.pack("i", hparams["n_layers"])) +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", ftype)) + + +# TODO: temporary hack to not deal with implementing the tokenizer +for piece in sp_proto.pieces: + encoded_piece = piece.piece.encode("utf-8") + fout.write(struct.pack("i", len(encoded_piece))) + fout.write(encoded_piece) + fout.write(struct.pack("f", piece.score)) + +if hparams["vocab_size"] > len(sp_proto.pieces): + for i in range(hparams["vocab_size"] - len(sp_proto.pieces)): + fout.write(struct.pack("i", 0)) + fout.write(struct.pack("f", 0)) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if ftype != 0: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + # header + str = name.encode("utf-8") + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/replit/main.cpp b/stable-diffusion.cpp/ggml/examples/replit/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8338cbbdeb681e70dc3c0b0619c5050943cbf95 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/replit/main.cpp @@ -0,0 +1,795 @@ +#include "ggml/ggml.h" + +#include "common-ggml.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#define NOMINMAX +#include +bool is_stdin_terminal() { + auto in = GetStdHandle(STD_INPUT_HANDLE); + return GetFileType(in) == FILE_TYPE_CHAR; +} +#else +#include +bool is_stdin_terminal() { + return isatty(STDIN_FILENO); +} +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +using piece_t = std::pair; +using piece_map_t = std::unordered_map; + +struct replit_tokenizer { + gpt_vocab raw_vocab; + piece_map_t piece_map; + std::vector vocab; +}; + +std::pair, float> encode_word(const std::string & word, const piece_map_t & model) { + std::vector best_segmentations_starts(word.length() + 1, -1); + best_segmentations_starts[0] = 0; + + std::vector best_segmentations_scores(word.length() + 1, -std::numeric_limits::infinity()); + best_segmentations_scores[0] = 1.0; + + for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) { + float best_score_at_start = best_segmentations_scores[start_idx]; + for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) { + std::string token = word.substr(start_idx, end_idx - start_idx); + if (model.count(token) && best_score_at_start != -std::numeric_limits::infinity()) { + float token_score = model.at(token).second; + float score = token_score + best_score_at_start; + if (best_segmentations_scores[end_idx] == -std::numeric_limits::infinity() || + best_segmentations_scores[end_idx] > score) { + best_segmentations_starts[end_idx] = start_idx; + 
+                    best_segmentations_scores[end_idx] = score;
+                }
+            }
+        }
+    }
+
+    if (best_segmentations_scores.back() == -std::numeric_limits<float>::infinity()) {
+        return std::make_pair(std::vector<std::size_t>{0}, 0.0f);
+    }
+
+    float score = best_segmentations_scores.back();
+    int start = best_segmentations_starts.back();
+    int end = word.length();
+    std::vector<std::size_t> tokens;
+    while (start != 0) {
+        const auto token_id = model.at(word.substr(start, end - start)).first;
+        tokens.insert(tokens.begin(), token_id);
+        int next_start = best_segmentations_starts[start];
+        end = start;
+        start = next_start;
+    }
+    const auto token_id = model.at(word.substr(start, end - start)).first;
+    tokens.insert(tokens.begin(), token_id);
+    return std::make_pair(tokens, score);
+}
+
+bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) {
+    std::string word;
+    std::vector<char> buf(128);
+
+    for (int i = 0; i < max_vocab_size; i++) {
+        uint32_t len;
+        fin.read((char *)&len, sizeof(len));
+
+        buf.resize(len);
+        fin.read((char *)buf.data(), len);
+        word.assign(buf.data(), len);
+
+        float score;
+        fin.read((char *)&score, sizeof(score));
+
+        // scores are negated so that encode_word can minimize the accumulated cost
+        tokenizer.piece_map[word] = std::make_pair(i, -score);
+        tokenizer.raw_vocab.id_to_token[i] = word;
+    }
+
+    return true;
+}
+
+std::string replace_all(const std::string & str,    // where to work
+                        const std::string & find,   // substitute 'find'
+                        const std::string & replace // by 'replace'
+) {
+    using namespace std;
+    string result;
+    size_t find_len = find.size();
+    size_t pos, from = 0;
+    while (string::npos != (pos = str.find(find, from))) {
+        result.append(str, from, pos - from);
+        result.append(replace);
+        from = pos + find_len;
+    }
+    result.append(str, from, string::npos);
+    return result;
+}
+
+std::string ws_symbol = "\342\226\201"; // U+2581, the sentencepiece whitespace marker
+std::vector<std::size_t> replit_tokenizer_tokenize(replit_tokenizer & tokenizer, const std::string & text) {
+    std::vector<std::size_t> tokens;
+    auto normalized_text = replace_all(text, " ", ws_symbol);
+    auto tokenized = encode_word(normalized_text, tokenizer.piece_map);
+
+    return tokenized.first;
+}
+
+std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std::vector<std::size_t> & tokens) {
+    std::string text;
+    for (auto token : tokens) {
+        text += tokenizer.raw_vocab.id_to_token[token];
+    }
+    auto denormalized_text = replace_all(text, ws_symbol, " ");
+    return denormalized_text;
+}
+
+// no defaults for now
+struct replit_hparams {
+    int32_t d_model     = 0;
+    int32_t max_seq_len = 0;
+    int32_t n_heads     = 0;
+    int32_t n_layers    = 0;
+    int32_t n_vocab     = 0;
+    int32_t ftype       = 0;
+};
+
+struct replit_layer {
+    // pre normalization
+    struct ggml_tensor * norm_1_weight;
+
+    // attention
+    struct ggml_tensor * c_attn_wqkv_weight;
+    struct ggml_tensor * c_attn_out_proj_weight;
+
+    // post normalization
+    struct ggml_tensor * norm_2_weight;
+
+    // ff
+    struct ggml_tensor * ffn_up_proj;
+    struct ggml_tensor * ffn_down_proj;
+};
+
+struct replit_model {
+    replit_hparams hparams;
+
+    struct ggml_tensor * wte_weight;    // token embedding (also used as the output head)
+    struct ggml_tensor * norm_f_weight; // final layer norm
+
+    std::vector<replit_layer> layers;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    struct ggml_context * ctx;
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+// load the model's weights from a file
+bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) {
+    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+
+    auto fin = std::ifstream(fname, std::ios::binary);
+    if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *)&magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *)&hparams.d_model, sizeof(hparams.d_model)); + fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len)); + fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads)); + fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers)); + fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *)&hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: d_model = %d\n", __func__, hparams.d_model); + printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); + printf("%s: n_heads = %d\n", __func__, hparams.n_heads); + printf("%s: n_layers = %d\n", __func__, hparams.n_layers); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + replit_tokenizer_load(vocab, fin, model.hparams.n_vocab); + + // for the big tensors, we have the option to store the data in 16-bit + // floats or quantized in order to save memory and also to speed up the + // computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), + model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.d_model; + const int n_layer = hparams.n_layers; + const int n_ctx = hparams.max_seq_len; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight + ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_f_weight + + ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight + ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight + ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight + ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight + ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight + ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight + + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v + + ctx_size += (1 + 6 * n_layer) * 512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const size_t n_embd = hparams.d_model; + const size_t n_layer = hparams.n_layers; + const size_t n_vocab = 
hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["transformer.wte.weight"] = model.wte_weight; + model.tensors["transformer.norm_f.weight"] = model.norm_f_weight; + + for (int i = 0; i < (int)n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); + layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); + layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); + + // map by name + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = + layer.c_attn_out_proj_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight; + model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj; + model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.d_model; + const int n_layer = hparams.n_layers; + const int n_ctx = hparams.max_seq_len; + + const int64_t n_mem = n_layer * n_ctx; + const int64_t n_elements = n_embd * n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = {1, 1}; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, + "%s: tensor '%s' has wrong shape in model file: got [%5d, " + "%5d], expected [%5d, %5d]\n", + __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], + 
ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, + "%s: tensor '%s' has wrong size in model file: got %zu, " + "expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool replit_eval(const replit_model & model, const int n_threads, const int n_past, + const std::vector & embd_inp, std::vector & embd_w, bool logits_all, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.d_model; + const int n_layer = hparams.n_layers; + const int n_head = hparams.n_heads; + const int n_vocab = hparams.n_vocab; + const int n_ctx = hparams.max_seq_len; + const float eps = 1e-5f; + + static size_t buf_size = 256u * 1024 * 1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && mem_per_token * N > buf_size) { + const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead + // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, + // buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); + + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); + + for (int il = 0; il < n_layer; ++il) { + + struct ggml_tensor * cur; + + // a = self.ln_1(x) + { + cur = ggml_norm(ctx0, inpL, eps); + + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); + } + + // self-attention + // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, + // attn_bias=attn_bias, attention_mask=attention_mask, + // is_causal=is_causal) + { + // compute QKV + cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); + + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); + + // store key and value to memory + { + struct ggml_tensor * k = + ggml_view_1d(ctx0, model.memory_k, N * n_embd, + (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); + struct ggml_tensor * v = + 
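+                // memory_k / memory_v are flat [n_layer * n_ctx * n_embd] caches; each view
+                // selects layer il at position n_past, i.e. element (il*n_ctx + n_past)*n_embd,
+                // converted to a byte offset via ggml_element_size()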
ggml_view_1d(ctx0, model.memory_v, N * n_embd, + (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, + // 2, 1, 3) [64, N, 12] + struct ggml_tensor * Q = ggml_permute( + ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, + 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, + // 3) [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, + il * n_ctx * ggml_element_size(model.memory_k) * n_embd), + n_embd / n_head, n_head, n_past + N), + 0, 2, 1, 3); + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); + + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, + // 2, 0, 3).contiguous() [n_past + N, 64, 12] + struct ggml_tensor * V_trans = ggml_cpy( + ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, + il * n_ctx * ggml_element_size(model.memory_v) * n_embd), + n_embd / n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } + } + + inpL = ggml_add(ctx0, inpL, cur); + + // m = self.ln_2(x) + { + cur = ggml_norm(ctx0, inpL, eps); + + cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); + } + + // n = self.mlp(m) + { + + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); + } + + // x = x + n + inpL = ggml_add(ctx0, inpL, cur); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, eps); + // inpL = ln_f_g*inpL + inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); + } + + // output embedding weight tied to input embedding + inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); + + // logits -> probs + // inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + // std::cout << "Qcur" << std::endl; + // print_tensor(Qcur); + + // if (n_past%100 == 0) { + // ggml_graph_print(&gf); + // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); + // } + + if (logits_all) { + // return result for all tokens + 
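+        // inpL holds n_vocab logits per token, one contiguous row per token,
+        // so the logits for token i start at offset n_vocab * i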
embd_w.resize(n_vocab * N); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N); + } else { + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + } + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0) / N; + } + // printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = ""; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + if (!is_stdin_terminal()) { + std::string line; + while (std::getline(std::cin, line)) { + params.prompt = params.prompt + "\n" + line; + } + } else { + params.prompt = gpt_random_prompt(rng); + } + } + + int64_t t_load_us = 0; + + replit_tokenizer vocab; + replit_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!replit_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = replit_tokenizer_tokenize(vocab, params.prompt); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + + for (size_t i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%zu] = %6zu\n", __func__, i, embd_inp[i]); + // vocab.id_to_token.at(embd_inp[i]).c_str() + } + printf("\n"); + + params.n_predict = std::min(params.n_predict, model.hparams.max_seq_len - (int)embd_inp.size()); + + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!replit_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, + temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) > params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", replit_tokenizer_detokenize(vocab, {static_cast(id)}).c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 0) { + break; + 
} + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, + t_predict_us / 1000.0f / n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/replit/quantize.cpp b/stable-diffusion.cpp/ggml/examples/replit/quantize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f274074bb725359ad038ea83518313f9c7c55e82 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/replit/quantize.cpp @@ -0,0 +1,182 @@ +#include "ggml/ggml.h" + +#include "common-ggml.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct mpt_hparams { + int32_t d_model = 0; + int32_t max_seq_len = 0; + int32_t n_heads = 0; + int32_t n_layers = 0; + int32_t n_vocab = 0; + int32_t ftype = 0; +}; + +// quantize a model +bool mpt_model_quantize(const std::string & fname_inp, + const std::string & fname_out, ggml_ftype ftype) { + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, + fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, + fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *)&magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", + __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *)&magic, sizeof(magic)); + } + + mpt_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.d_model, sizeof(hparams.d_model)); + finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); + finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); + finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: d_model = %d\n", __func__, hparams.d_model); + printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); + printf("%s: n_heads = %d\n", __func__, hparams.n_heads); + printf("%s: n_layers = %d\n", __func__, hparams.n_layers); + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.d_model, sizeof(hparams.d_model)); + fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); + fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads)); + fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers)); + fout.write((char 
*) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + const int32_t n_vocab = hparams.n_vocab; + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read((char *)&len, sizeof(len)); + fout.write((char *)&len, sizeof(len)); + + word.resize(len); + finp.read((char *)word.data(), len); + fout.write((char *)word.data(), len); + + float prob; + finp.read((char *)&prob, sizeof(prob)); + fout.write((char *)&prob, sizeof(prob)); + } + } + + printf("%s: quantizing tensors\n", __func__); + + // regexes of tensor names to be quantized + const std::vector to_quant = { + ".*weight", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, + fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./replit-quantize models/replit/ggml-model.bin +// models/replit/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", + argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = {0, NULL, false}; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", + __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, + t_quantize_us / 1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, + (t_main_end_us - t_main_start_us) / 1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/sam/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/sam/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa302bb402708471dbbe0b11a88e5de3ca4c1aa7 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/sam/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# sam + +set(TEST_TARGET sam) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) + +# +# sam-quantize + +#set(TEST_TARGET sam-quantize) +#add_executable(${TEST_TARGET} quantize.cpp) +#target_link_libraries(${TEST_TARGET} PRIVATE ggml common) diff --git a/stable-diffusion.cpp/ggml/examples/sam/README.md b/stable-diffusion.cpp/ggml/examples/sam/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e807c7405dd32075ab18c5da1e05fdd02c82bdd --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/sam/README.md @@ -0,0 +1,105 @@ +# SAM.cpp + +Inference of Meta's [Segment Anything Model](https://github.com/facebookresearch/segment-anything/) in pure C/C++ + +## Description + +The example currently supports only the [ViT-B SAM model checkpoint](https://huggingface.co/facebook/sam-vit-base). 
+ +## Next steps + +- [X] Reduce memory usage by utilizing the new ggml-alloc +- [X] Remove redundant graph nodes +- [ ] Make inference faster +- [X] Fix the difference in output masks compared to the PyTorch implementation +- [X] Filter masks based on stability score +- [ ] Add support for user input +- [ ] Support F16 for heavy F32 ops +- [ ] Test quantization +- [X] Support bigger model checkpoints +- [ ] GPU support + +## Quick start +```bash +git clone https://github.com/ggerganov/ggml +cd ggml + +# Install Python dependencies +python3 -m pip install -r requirements.txt + +# Convert PTH model to ggml +python convert-pth-to-ggml.py examples/sam/sam_vit_b_01ec64.pth . 1 + +# Build ggml + examples +mkdir build && cd build +cmake .. && make -j4 + +# run inference +./bin/sam -t 16 -i ../img.jpg -m examples/sam/ggml-model-f16.bin +``` + +## Downloading and converting the model checkpoints + +You can download a [model checkpoint](https://github.com/facebookresearch/segment-anything/tree/main#model-checkpoints) and convert it to `ggml` format using the script `convert-pth-to-ggml.py`: + +``` +# Convert PTH model to ggml +python convert-pth-to-ggml.py examples/sam/sam_vit_b_01ec64.pth . 1 +``` + +## Example output on M2 Ultra +``` + $ ▶ make -j sam && time ./bin/sam -t 8 -i img.jpg +[ 28%] Built target common +[ 71%] Built target ggml +[100%] Built target sam +main: seed = 1693224265 +main: loaded image 'img.jpg' (680 x 453) +sam_image_preprocess: scale = 0.664062 +main: preprocessed image (1024 x 1024) +sam_model_load: loading model from 'models/sam-vit-b/ggml-model-f16.bin' - please wait ... +sam_model_load: n_enc_state = 768 +sam_model_load: n_enc_layer = 12 +sam_model_load: n_enc_head = 12 +sam_model_load: n_enc_out_chans = 256 +sam_model_load: n_pt_embd = 4 +sam_model_load: ftype = 1 +sam_model_load: qntvr = 0 +operator(): ggml ctx size = 202.32 MB +sam_model_load: ...................................... 
done +sam_model_load: model size = 185.05 MB / num tensors = 304 +embd_img +dims: 64 64 256 1 f32 +First & Last 10 elements: +-0.05117 -0.06408 -0.07154 -0.06991 -0.07212 -0.07690 -0.07508 -0.07281 -0.07383 -0.06779 +0.01589 0.01775 0.02250 0.01675 0.01766 0.01661 0.01811 0.02051 0.02103 0.03382 +sum: 12736.272313 + +Skipping mask 0 with iou 0.705935 below threshold 0.880000 +Skipping mask 1 with iou 0.762136 below threshold 0.880000 +Mask 2: iou = 0.947081, stability_score = 0.955437, bbox (371, 436), (144, 168) + + +main: load time = 51.28 ms +main: total time = 2047.49 ms + +real 0m2.068s +user 0m16.343s +sys 0m0.214s +``` + +Input point is (414.375, 162.796875) (currently hardcoded) + +Input image: + +![llamas](https://user-images.githubusercontent.com/8558655/261301565-37b7bf4b-bf91-40cf-8ec1-1532316e1612.jpg) + +Output mask (mask_out_2.png in build folder): + +![mask_glasses](https://user-images.githubusercontent.com/8558655/263706800-47eeea30-1457-4c87-938b-8f11536c5aa7.png) + +## References + +- [ggml](https://github.com/ggerganov/ggml) +- [SAM](https://segment-anything.com/) +- [SAM demo](https://segment-anything.com/demo) diff --git a/stable-diffusion.cpp/ggml/examples/sam/convert-pth-to-ggml.py b/stable-diffusion.cpp/ggml/examples/sam/convert-pth-to-ggml.py new file mode 100644 index 0000000000000000000000000000000000000000..0de422e5517d58de334ec4ecd58aa6528c56fc48 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/sam/convert-pth-to-ggml.py @@ -0,0 +1,147 @@ +# Convert a SAM model checkpoint to a ggml compatible file +# + +import sys +import torch +import struct +import numpy as np + +if len(sys.argv) < 3: + print("Usage: convert-pth-to-ggml.py file-model dir-output [ftype]\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +fname_model = sys.argv[1] +dir_out = sys.argv[2] +fname_out = dir_out + "/ggml-model.bin" + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 3: + ftype = int(sys.argv[3]) + +if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + +fname_out = fname_out.replace(".bin", "-" + ftype_str[ftype] + ".bin") + +# Default params are set to sam_vit_b checkpoint +n_enc_state = 768 +n_enc_layers = 12 +n_enc_heads = 12 +n_enc_out_chans = 256 +n_pt_embd = 4 + +model = torch.load(fname_model, map_location="cpu") +for k, v in model.items(): + print(k, v.shape) + if k == "image_encoder.blocks.0.norm1.weight": + n_enc_state = v.shape[0] + +if n_enc_state == 1024: # sam_vit_l + n_enc_layers = 24 + n_enc_heads = 16 +elif n_enc_state == 1280: # sam_vit_h + n_enc_layers = 32 + n_enc_heads = 16 + +hparams = { + "n_enc_state": n_enc_state, + "n_enc_layers": n_enc_layers, + "n_enc_heads": n_enc_heads, + "n_enc_out_chans": n_enc_out_chans, + "n_pt_embd": n_pt_embd, +} + +print(hparams) + +for k, v in model.items(): + print(k, v.shape) + +#exit() +#code.interact(local=locals()) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["n_enc_state"])) +fout.write(struct.pack("i", hparams["n_enc_layers"])) +fout.write(struct.pack("i", hparams["n_enc_heads"])) +fout.write(struct.pack("i", hparams["n_enc_out_chans"])) +fout.write(struct.pack("i", hparams["n_pt_embd"])) +fout.write(struct.pack("i", ftype)) + +for k, v in model.items(): + name = k + shape = v.shape + + if name[:19] == 
"prompt_encoder.mask": + continue + + print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype) + + #data = tf.train.load_variable(dir_model, name).squeeze() + #data = v.numpy().squeeze() + data = v.numpy() + n_dims = len(data.shape) + + # for efficiency - transpose some matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + #if name[-14:] == "/attn/c_attn/w" or \ + # name[-14:] == "/attn/c_proj/w" or \ + # name[-11:] == "/mlp/c_fc/w" or \ + # name[-13:] == "/mlp/c_proj/w": + # print(" Transposing") + # data = data.transpose() + + dshape = data.shape + + # default type is fp16 + ftype_cur = 1 + if ftype == 0 or n_dims == 1 or \ + name == "image_encoder.pos_embed" or \ + name.startswith("prompt_encoder") or \ + name.startswith("mask_decoder.iou_token") or \ + name.startswith("mask_decoder.mask_tokens"): + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + print(" Converting to float16") + data = data.astype(np.float16) + + # reshape the 1D bias into a 4D tensor so we can use ggml_repeat + # keep it in F32 since the data is small + if name == "image_encoder.patch_embed.proj.bias": + data = data.reshape(1, data.shape[0], 1, 1) + n_dims = len(data.shape) + dshape = data.shape + + print(" New shape: ", dshape) + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", dshape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/sam/main.cpp b/stable-diffusion.cpp/ggml/examples/sam/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1de41f30a8e6db5d5c5c3441fe911bb981073cbe --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/sam/main.cpp @@ -0,0 +1,2202 @@ +#define _USE_MATH_DEFINES // for M_PI +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows + +#include "ggml.h" +#include "ggml-alloc.h" +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include "stb_image_write.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams (ViT-B SAM) +struct sam_hparams { + int32_t n_enc_state = 768; + int32_t n_enc_layer = 12; + int32_t n_enc_head = 12; + int32_t n_enc_out_chans = 256; + int32_t n_pt_embd = 4; + int32_t n_dec_heads = 8; + int32_t ftype = 1; + float mask_threshold = 0.f; + float iou_threshold = 0.88f; + float stability_score_threshold = 0.95f; + float stability_score_offset = 1.0f; + float eps = 1e-6f; + float eps_decoder_transformer = 1e-5f; + + int32_t n_enc_head_dim() const { return n_enc_state / n_enc_head; } + int32_t n_img_size() const { return 1024; } + int32_t n_window_size() const { return 14; } + int32_t n_patch_size() const { return 16; } + int32_t n_img_embd() const { return n_img_size() / n_patch_size(); } + + std::vector global_attn_indices() const { + switch (n_enc_state) { + case 768: return { 2, 5, 8, 11 }; + case 1024: return { 5, 11, 17, 23 }; + case 1280: return { 7, 15, 23, 31 }; + default: + { + fprintf(stderr, "%s: unsupported n_enc_state = %d\n", __func__, n_enc_state); + } break; + }; + + return {}; + } + + bool 
is_global_attn(int32_t layer) const { + const auto indices = global_attn_indices(); + + for (const auto & idx : indices) { + if (layer == idx) { + return true; + } + } + + return false; + } +}; + +struct sam_layer_enc { + struct ggml_tensor * norm1_w; + struct ggml_tensor * norm1_b; + + struct ggml_tensor * rel_pos_w; + struct ggml_tensor * rel_pos_h; + + struct ggml_tensor * qkv_w; + struct ggml_tensor * qkv_b; + + struct ggml_tensor * proj_w; + struct ggml_tensor * proj_b; + + struct ggml_tensor * norm2_w; + struct ggml_tensor * norm2_b; + + struct ggml_tensor * mlp_lin1_w; + struct ggml_tensor * mlp_lin1_b; + + struct ggml_tensor * mlp_lin2_w; + struct ggml_tensor * mlp_lin2_b; +}; + +struct sam_encoder_image { + struct ggml_tensor * pe; + + struct ggml_tensor * proj_w; + struct ggml_tensor * proj_b; + + struct ggml_tensor * neck_conv_0; + struct ggml_tensor * neck_norm_0_w; + struct ggml_tensor * neck_norm_0_b; + struct ggml_tensor * neck_conv_1; + struct ggml_tensor * neck_norm_1_w; + struct ggml_tensor * neck_norm_1_b; + + std::vector layers; +}; + +struct sam_encoder_prompt { + struct ggml_tensor * pe; + + struct ggml_tensor * not_a_pt_embd_w; + std::vector pt_embd; + + struct ggml_tensor * no_mask_embd_w; + //std::vector mask_down_w; + //std::vector mask_down_b; +}; + +struct sam_layer_dec_transformer_attn { + // q_proj + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + + // k_proj + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + + // v_proj + struct ggml_tensor * v_w; + struct ggml_tensor * v_b; + + // out_proj + struct ggml_tensor * out_w; + struct ggml_tensor * out_b; +}; + +struct sam_layer_dec_transformer { + sam_layer_dec_transformer_attn self_attn; + + // norm1 + struct ggml_tensor * norm1_w; + struct ggml_tensor * norm1_b; + + sam_layer_dec_transformer_attn cross_attn_token_to_img; + + // norm2 + struct ggml_tensor * norm2_w; + struct ggml_tensor * norm2_b; + + // mlp.lin1 + struct ggml_tensor * mlp_lin1_w; + struct ggml_tensor * mlp_lin1_b; + + // mlp.lin2 + struct ggml_tensor * mlp_lin2_w; + struct ggml_tensor * mlp_lin2_b; + + // norm3 + struct ggml_tensor * norm3_w; + struct ggml_tensor * norm3_b; + + // norm4 + struct ggml_tensor * norm4_w; + struct ggml_tensor * norm4_b; + + sam_layer_dec_transformer_attn cross_attn_img_to_token; +}; + +struct sam_layer_dec_output_hypernet_mlps { + // mlps_*.layers.0 + struct ggml_tensor * w_0; + struct ggml_tensor * b_0; + + // mlps_*.layers.1 + struct ggml_tensor * w_1; + struct ggml_tensor * b_1; + + // mlps_*.layers.2 + struct ggml_tensor * w_2; + struct ggml_tensor * b_2; +}; + +struct sam_decoder_mask { + std::vector transformer_layers; + + // trasnformer.final_attn_token_to_image + sam_layer_dec_transformer_attn transformer_final_attn_token_to_img; + + // transformer.norm_final + struct ggml_tensor * transformer_norm_final_w; + struct ggml_tensor * transformer_norm_final_b; + + // output_upscaling.0 + struct ggml_tensor * output_upscaling_0_w; + struct ggml_tensor * output_upscaling_0_b; + + // output_upscaling.1 + struct ggml_tensor * output_upscaling_1_w; + struct ggml_tensor * output_upscaling_1_b; + + // output_upscaling.3 + struct ggml_tensor * output_upscaling_3_w; + struct ggml_tensor * output_upscaling_3_b; + + // output_hypernetworks_mlps + std::vector output_hypernet_mlps; + + // iou_prediction_head.0 + struct ggml_tensor * iou_prediction_head_0_w; + struct ggml_tensor * iou_prediction_head_0_b; + + // iou_prediction_head.1 + struct ggml_tensor * iou_prediction_head_1_w; + struct ggml_tensor * 
iou_prediction_head_1_b; + + // iou_prediction_head.2 + struct ggml_tensor * iou_prediction_head_2_w; + struct ggml_tensor * iou_prediction_head_2_b; + + // iou_token.weight + struct ggml_tensor * iou_token_w; + + // mask_tokens.weight + struct ggml_tensor * mask_tokens_w; +}; + + +struct sam_state { + struct ggml_tensor * embd_img; + + struct ggml_tensor * low_res_masks; + struct ggml_tensor * iou_predictions; + + //struct ggml_tensor * tmp_save = {}; + + struct ggml_context * ctx; + + // buffer for `ggml_graph_plan.work_data` + std::vector work_buffer; + // buffers to evaluate the model + std::vector buf_alloc_img_enc; + std::vector buf_compute_img_enc; + + std::vector buf_alloc_fast; + std::vector buf_compute_fast; + + struct ggml_allocr * allocr = {}; +}; + +// void save_tensor(sam_state& state, struct ggml_tensor * t, struct ggml_cgraph * gf) { +// if (!state.tmp_save) { +// state.tmp_save = ggml_new_tensor(state.ctx, t->type, t->n_dims, t->ne); +// } +// struct ggml_tensor * tmp0 = ggml_cpy(state.ctx, t, state.tmp_save); +// ggml_build_forward_expand(gf, tmp0); +// } + +struct sam_model { + sam_hparams hparams; + + sam_encoder_image enc_img; + sam_encoder_prompt enc_prompt; + sam_decoder_mask dec; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct sam_point { + float x; + float y; +}; + +// RGB uint8 image +struct sam_image_u8 { + int nx; + int ny; + + std::vector data; +}; + +// RGB float32 image +// Memory layout: RGBRGBRGB... +struct sam_image_f32 { + int nx; + int ny; + + std::vector data; +}; + +void print_t_f32(const char* title, struct ggml_tensor * t, int n = 10) { + printf("%s\n", title); + float * data = (float *)t->data; + printf("dims: % " PRId64 " % " PRId64 " % " PRId64 " % " PRId64 " f32\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]); + printf("First & Last %d elements:\n", n); + for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) { + printf("%.5f ", data[i]); + if (i != 0 && i % t->ne[0] == 0) { + printf("\n"); + } + } + printf("\n"); + for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) { + printf("%.5f ", data[ggml_nelements(t) - n + i]); + if ((ggml_nelements(t) - n + i) % t->ne[0] == 0) { + printf("\n"); + } + } + printf("\n"); + double sum = 0.0; + for (int i = 0; i < ggml_nelements(t); i++) { + sum += data[i]; + } + printf("sum: %f\n\n", sum); +} + +static void ggml_disconnect_node_from_graph(ggml_tensor * t) { + t->op = GGML_OP_NONE; + for (int i = 0; i < GGML_MAX_SRC; i++) { + t->src[i] = NULL; + } +} + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +static void ggml_sam_sin(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) { + GGML_ASSERT(userdata == NULL); + GGML_ASSERT(ggml_are_same_shape(dst, src)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src)); + + const float * src_data = ggml_get_data_f32(src); + float * dst_data = ggml_get_data_f32(dst); + + const int ne = (int)ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = std::min(ie0 + dr, ne); + + for (int i = ie0; i < ie1; ++i) { + dst_data[i] = sinf(src_data[i]); + } +} + +static void ggml_sam_cos(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) { + 
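+    // same pattern as ggml_sam_sin above: ggml invokes the callback on nth threads,
+    // and thread ith processes the contiguous chunk [ie0, ie1) of the flattened elements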
GGML_ASSERT(userdata == NULL); + GGML_ASSERT(ggml_are_same_shape(dst, src)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src)); + + const float * src_data = ggml_get_data_f32(src); + float * dst_data = ggml_get_data_f32(dst); + + const int ne = (int)ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = std::min(ie0 + dr, ne); + + for (int i = ie0; i < ie1; ++i) { + dst_data[i] = cosf(src_data[i]); + } +} + +bool sam_image_load_from_file(const std::string & fname, sam_image_u8 & img) { + int nx, ny, nc; + auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3); + if (!data) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + return false; + } + + img.nx = nx; + img.ny = ny; + img.data.resize(nx * ny * 3); + memcpy(img.data.data(), data, nx * ny * 3); + + stbi_image_free(data); + + return true; +} + +// ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L164 +// resize largest dimension to 1024 +// normalize: x = (x - mean) / std +// mean = [123.675, 116.28, 103.53] +// std = [58.395, 57.12, 57.375] +// TODO: why are these hardcoded !? +// pad to 1024x1024 +// TODO: for some reason, this is not numerically identical to pytorch's interpolation +bool sam_image_preprocess(const sam_image_u8 & img, sam_image_f32 & res) { + const int nx = img.nx; + const int ny = img.ny; + + const int nx2 = 1024; + const int ny2 = 1024; + + res.nx = nx2; + res.ny = ny2; + res.data.resize(3*nx2*ny2); + + const float scale = std::max(nx, ny) / 1024.0f; + + fprintf(stderr, "%s: scale = %f\n", __func__, scale); + + const int nx3 = int(nx/scale + 0.5f); + const int ny3 = int(ny/scale + 0.5f); + + const float m3[3] = { 123.675f, 116.280f, 103.530f }; + const float s3[3] = { 58.395f, 57.120f, 57.375f }; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + // linear interpolation + const float sx = (x + 0.5f)*scale - 0.5f; + const float sy = (y + 0.5f)*scale - 0.5f; + + const int x0 = std::max(0, (int) std::floor(sx)); + const int y0 = std::max(0, (int) std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = 3*(y0*nx + x0) + c; + const int j01 = 3*(y0*nx + x1) + c; + const int j10 = 3*(y1*nx + x0) + c; + const int j11 = 3*(y1*nx + x1) + c; + + const float v00 = img.data[j00]; + const float v01 = img.data[j01]; + const float v10 = img.data[j10]; + const float v11 = img.data[j11]; + + const float v0 = v00*(1.0f - dx) + v01*dx; + const float v1 = v10*(1.0f - dx) + v11*dx; + + const float v = v0*(1.0f - dy) + v1*dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + const int i = 3*(y*nx3 + x) + c; + + res.data[i] = (float(v2) - m3[c]) / s3[c]; + } + } + } + + return true; +} + +// load the model's weights from a file +bool sam_model_load(const std::string & fname, sam_model & model) { + fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 
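+            // the hex constant 0x67676d6c spells "ggml" -- the magic written by convert-pth-to-ggml.py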
+ return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_enc_state, sizeof(hparams.n_enc_state)); + fin.read((char *) &hparams.n_enc_layer, sizeof(hparams.n_enc_layer)); + fin.read((char *) &hparams.n_enc_head, sizeof(hparams.n_enc_head)); + fin.read((char *) &hparams.n_enc_out_chans, sizeof(hparams.n_enc_out_chans)); + fin.read((char *) &hparams.n_pt_embd, sizeof(hparams.n_pt_embd)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_enc_state = %d\n", __func__, hparams.n_enc_state); + printf("%s: n_enc_layer = %d\n", __func__, hparams.n_enc_layer); + printf("%s: n_enc_head = %d\n", __func__, hparams.n_enc_head); + printf("%s: n_enc_out_chans = %d\n", __func__, hparams.n_enc_out_chans); + printf("%s: n_pt_embd = %d\n", __func__, hparams.n_pt_embd); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + const size_t ctx_size = [&]() { + size_t ctx_size = 0; + + const auto & hparams = model.hparams; + + const int32_t n_enc_state = hparams.n_enc_state; + const int32_t n_enc_layer = hparams.n_enc_layer; + const int32_t n_enc_head_dim = hparams.n_enc_head_dim(); + const int32_t n_enc_out_chans = hparams.n_enc_out_chans; + const int32_t n_pt_embd = hparams.n_pt_embd; + + const int32_t n_enc_layer_local = hparams.global_attn_indices().size(); + const int32_t n_enc_layer_global = n_enc_layer - n_enc_layer_local; + + const int32_t n_img_embd = hparams.n_img_embd(); + const int32_t n_window_size = hparams.n_window_size(); + const int32_t n_patch_size = hparams.n_patch_size(); + + // image encoder + { + ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16); + + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + } + + // image encoder layers + { + ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16); + + ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16); + + ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += 
n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + } + + ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead(); + + // prompt encoder + { + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2) + + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + } + + ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead(); + + // mask decoder + { + //transformer + { + const int tfm_layers_count = 2; + const int qkv_count = 3; + const int norm_count = 4; + const int n_hypernet_mpls_count = 4; + + // self_attn + ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + // all norms + ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + + // cross_attn_token_to_img + ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + // mlp + ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); + + // cross_attn_img_to_token + ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + // transformer_final_attn_token_to_img + ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + + // transformer_norm_final + ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + + // output_upscaling + ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32); + + // output_hypernetworks_mlps + ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += 
n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32); + + // iou_prediction_head + ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32); + + // iou_token_w + ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + + // mask_tokens_w + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + } + } + fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + + return ctx_size; + }(); + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int32_t n_enc_state = hparams.n_enc_state; + const int32_t n_enc_layer = hparams.n_enc_layer; + const int32_t n_enc_head_dim = hparams.n_enc_head_dim(); + const int32_t n_enc_out_chans = hparams.n_enc_out_chans; + const int32_t n_pt_embd = hparams.n_pt_embd; + + const int32_t n_img_embd = hparams.n_img_embd(); + const int32_t n_window_size = hparams.n_window_size(); + const int32_t n_patch_size = hparams.n_patch_size(); + + model.enc_img.layers.resize(n_enc_layer); + + // image encoder + { + auto & enc = model.enc_img; + + enc.pe = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_enc_state, n_img_embd, n_img_embd, 1); + + enc.proj_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_patch_size, n_patch_size, 3, n_enc_state); + enc.proj_b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, 1, n_enc_state); + + enc.neck_conv_0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, n_enc_state, n_enc_out_chans); + enc.neck_conv_1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, n_enc_out_chans, n_enc_out_chans); + + enc.neck_norm_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + enc.neck_norm_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + enc.neck_norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + enc.neck_norm_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + model.tensors["image_encoder.pos_embed"] = enc.pe; + + model.tensors["image_encoder.patch_embed.proj.weight"] = enc.proj_w; + model.tensors["image_encoder.patch_embed.proj.bias"] = enc.proj_b; + + model.tensors["image_encoder.neck.0.weight"] = enc.neck_conv_0; + model.tensors["image_encoder.neck.2.weight"] = enc.neck_conv_1; + + model.tensors["image_encoder.neck.1.weight"] = enc.neck_norm_0_w; + model.tensors["image_encoder.neck.1.bias"] = enc.neck_norm_0_b; + + model.tensors["image_encoder.neck.3.weight"] = enc.neck_norm_1_w; + model.tensors["image_encoder.neck.3.bias"] = enc.neck_norm_1_b; + + for (int i = 0; i < n_enc_layer; ++i) { + auto & layer = enc.layers[i]; + + layer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + layer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + + if (hparams.is_global_attn(i)) { + layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_img_embd - 1); + layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 
n_enc_head_dim, 2*n_img_embd - 1); + } else { + layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1); + layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1); + } + + layer.qkv_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_state, 3*n_enc_state); + layer.qkv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_enc_state); + + layer.proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_state, n_enc_state); + layer.proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + + layer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + layer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + + layer.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_state, 4*n_enc_state); + layer.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_enc_state); + + layer.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4*n_enc_state, n_enc_state); + layer.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state); + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.weight"] = layer.norm1_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.bias"] = layer.norm1_b; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_w"] = layer.rel_pos_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_h"] = layer.rel_pos_h; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.weight"] = layer.qkv_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.bias"] = layer.qkv_b; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.weight"] = layer.proj_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.bias"] = layer.proj_b; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.weight"] = layer.norm2_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.bias"] = layer.norm2_b; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.weight"] = layer.mlp_lin1_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.bias"] = layer.mlp_lin1_b; + + model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.weight"] = layer.mlp_lin2_w; + model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.bias"] = layer.mlp_lin2_b; + } + } + + // prompt encoder + { + auto & enc = model.enc_prompt; + + enc.pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans/2, 2); + + enc.not_a_pt_embd_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + enc.no_mask_embd_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + model.tensors["prompt_encoder.pe_layer.positional_encoding_gaussian_matrix"] = enc.pe; + model.tensors["prompt_encoder.not_a_point_embed.weight"] = enc.not_a_pt_embd_w; + model.tensors["prompt_encoder.no_mask_embed.weight"] = enc.no_mask_embd_w; + + enc.pt_embd.resize(n_pt_embd); + for (int i = 0; i < n_pt_embd; i++) { + enc.pt_embd[i] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + model.tensors["prompt_encoder.point_embeddings." 
+ std::to_string(i) + ".weight"] = enc.pt_embd[i]; + } + } + + // mask decoder + { + auto & dec = model.dec; + auto & tfm_layers = dec.transformer_layers; + + const int tfm_layers_count = 2; + tfm_layers.resize(tfm_layers_count); + for (int i = 0; i < tfm_layers_count; ++i) { + auto& l = tfm_layers[i]; + l.self_attn.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + l.self_attn.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.self_attn.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + l.self_attn.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.self_attn.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + l.self_attn.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.self_attn.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + l.self_attn.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.cross_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans); + l.cross_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, 8*n_enc_out_chans); + l.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8*n_enc_out_chans); + l.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8*n_enc_out_chans, n_enc_out_chans); + l.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.norm4_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + l.norm4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + l.cross_attn_img_to_token.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_img_to_token.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_img_to_token.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_img_to_token.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_img_to_token.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + l.cross_attn_img_to_token.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + l.cross_attn_img_to_token.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans); + l.cross_attn_img_to_token.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + const auto 
prefix = "mask_decoder.transformer.layers." + std::to_string(i) + "."; + model.tensors[prefix + "self_attn.q_proj.weight"] = l.self_attn.q_w; + model.tensors[prefix + "self_attn.q_proj.bias"] = l.self_attn.q_b; + model.tensors[prefix + "self_attn.k_proj.weight"] = l.self_attn.k_w; + model.tensors[prefix + "self_attn.k_proj.bias"] = l.self_attn.k_b; + model.tensors[prefix + "self_attn.v_proj.weight"] = l.self_attn.v_w; + model.tensors[prefix + "self_attn.v_proj.bias"] = l.self_attn.v_b; + model.tensors[prefix + "self_attn.out_proj.weight"] = l.self_attn.out_w; + model.tensors[prefix + "self_attn.out_proj.bias"] = l.self_attn.out_b; + + model.tensors[prefix + "norm1.weight"] = l.norm1_w; + model.tensors[prefix + "norm1.bias"] = l.norm1_b; + + model.tensors[prefix + "cross_attn_token_to_image.q_proj.weight"] = l.cross_attn_token_to_img.q_w; + model.tensors[prefix + "cross_attn_token_to_image.q_proj.bias"] = l.cross_attn_token_to_img.q_b; + model.tensors[prefix + "cross_attn_token_to_image.k_proj.weight"] = l.cross_attn_token_to_img.k_w; + model.tensors[prefix + "cross_attn_token_to_image.k_proj.bias"] = l.cross_attn_token_to_img.k_b; + model.tensors[prefix + "cross_attn_token_to_image.v_proj.weight"] = l.cross_attn_token_to_img.v_w; + model.tensors[prefix + "cross_attn_token_to_image.v_proj.bias"] = l.cross_attn_token_to_img.v_b; + model.tensors[prefix + "cross_attn_token_to_image.out_proj.weight"] = l.cross_attn_token_to_img.out_w; + model.tensors[prefix + "cross_attn_token_to_image.out_proj.bias"] = l.cross_attn_token_to_img.out_b; + + model.tensors[prefix + "norm2.weight"] = l.norm2_w; + model.tensors[prefix + "norm2.bias"] = l.norm2_b; + + model.tensors[prefix + "mlp.lin1.weight"] = l.mlp_lin1_w; + model.tensors[prefix + "mlp.lin1.bias"] = l.mlp_lin1_b; + model.tensors[prefix + "mlp.lin2.weight"] = l.mlp_lin2_w; + model.tensors[prefix + "mlp.lin2.bias"] = l.mlp_lin2_b; + + model.tensors[prefix + "norm3.weight"] = l.norm3_w; + model.tensors[prefix + "norm3.bias"] = l.norm3_b; + model.tensors[prefix + "norm4.weight"] = l.norm4_w; + model.tensors[prefix + "norm4.bias"] = l.norm4_b; + + model.tensors[prefix + "cross_attn_image_to_token.q_proj.weight"] = l.cross_attn_img_to_token.q_w; + model.tensors[prefix + "cross_attn_image_to_token.q_proj.bias"] = l.cross_attn_img_to_token.q_b; + model.tensors[prefix + "cross_attn_image_to_token.k_proj.weight"] = l.cross_attn_img_to_token.k_w; + model.tensors[prefix + "cross_attn_image_to_token.k_proj.bias"] = l.cross_attn_img_to_token.k_b; + model.tensors[prefix + "cross_attn_image_to_token.v_proj.weight"] = l.cross_attn_img_to_token.v_w; + model.tensors[prefix + "cross_attn_image_to_token.v_proj.bias"] = l.cross_attn_img_to_token.v_b; + model.tensors[prefix + "cross_attn_image_to_token.out_proj.weight"] = l.cross_attn_img_to_token.out_w; + model.tensors[prefix + "cross_attn_image_to_token.out_proj.bias"] = l.cross_attn_img_to_token.out_b; + } + + dec.transformer_final_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + dec.transformer_final_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + dec.transformer_final_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + dec.transformer_final_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + dec.transformer_final_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2); + 
dec.transformer_final_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2); + dec.transformer_final_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans); + dec.transformer_final_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.weight"] = dec.transformer_final_attn_token_to_img.q_w; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.bias"] = dec.transformer_final_attn_token_to_img.q_b; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.weight"] = dec.transformer_final_attn_token_to_img.k_w; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.bias"] = dec.transformer_final_attn_token_to_img.k_b; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.weight"] = dec.transformer_final_attn_token_to_img.v_w; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.bias"] = dec.transformer_final_attn_token_to_img.v_b; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.weight"] = dec.transformer_final_attn_token_to_img.out_w; + model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.bias"] = dec.transformer_final_attn_token_to_img.out_b; + + dec.transformer_norm_final_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + dec.transformer_norm_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + + model.tensors["mask_decoder.transformer.norm_final_attn.weight"] = dec.transformer_norm_final_w; + model.tensors["mask_decoder.transformer.norm_final_attn.bias"] = dec.transformer_norm_final_b; + + dec.output_upscaling_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, n_img_embd, n_enc_out_chans); + dec.output_upscaling_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd); + dec.output_upscaling_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd); + dec.output_upscaling_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd); + dec.output_upscaling_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, n_img_embd/2, n_img_embd); + dec.output_upscaling_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2); + + model.tensors["mask_decoder.output_upscaling.0.weight"] = dec.output_upscaling_0_w; + model.tensors["mask_decoder.output_upscaling.0.bias"] = dec.output_upscaling_0_b; + model.tensors["mask_decoder.output_upscaling.1.weight"] = dec.output_upscaling_1_w; + model.tensors["mask_decoder.output_upscaling.1.bias"] = dec.output_upscaling_1_b; + model.tensors["mask_decoder.output_upscaling.3.weight"] = dec.output_upscaling_3_w; + model.tensors["mask_decoder.output_upscaling.3.bias"] = dec.output_upscaling_3_b; + + const int n_hypernet_mpls_count = 4; + dec.output_hypernet_mlps.resize(n_hypernet_mpls_count); + for (int i = 0; i < n_hypernet_mpls_count; ++i) { + auto& mlp = dec.output_hypernet_mlps[i]; + + mlp.w_0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + mlp.b_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + mlp.w_1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + mlp.b_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + mlp.w_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_img_embd/2); + mlp.b_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2); + + const auto prefix = "mask_decoder.output_hypernetworks_mlps." 
+ std::to_string(i) + "."; + model.tensors[prefix + "layers.0.weight"] = mlp.w_0; + model.tensors[prefix + "layers.0.bias"] = mlp.b_0; + model.tensors[prefix + "layers.1.weight"] = mlp.w_1; + model.tensors[prefix + "layers.1.bias"] = mlp.b_1; + model.tensors[prefix + "layers.2.weight"] = mlp.w_2; + model.tensors[prefix + "layers.2.bias"] = mlp.b_2; + } + + dec.iou_prediction_head_0_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + dec.iou_prediction_head_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + dec.iou_prediction_head_1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans); + dec.iou_prediction_head_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans); + dec.iou_prediction_head_2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_pt_embd); + dec.iou_prediction_head_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_pt_embd); + + dec.iou_token_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, 1); + dec.mask_tokens_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, n_pt_embd); + + model.tensors["mask_decoder.iou_prediction_head.layers.0.weight"] = dec.iou_prediction_head_0_w; + model.tensors["mask_decoder.iou_prediction_head.layers.0.bias"] = dec.iou_prediction_head_0_b; + model.tensors["mask_decoder.iou_prediction_head.layers.1.weight"] = dec.iou_prediction_head_1_w; + model.tensors["mask_decoder.iou_prediction_head.layers.1.bias"] = dec.iou_prediction_head_1_b; + model.tensors["mask_decoder.iou_prediction_head.layers.2.weight"] = dec.iou_prediction_head_2_w; + model.tensors["mask_decoder.iou_prediction_head.layers.2.bias"] = dec.iou_prediction_head_2_b; + + model.tensors["mask_decoder.iou_token.weight"] = dec.iou_token_w; + model.tensors["mask_decoder.mask_tokens.weight"] = dec.mask_tokens_w; + } + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + fprintf(stderr, "%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + + if (fin.eof()) { + break; + } + + int64_t nelements = 1; + int64_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + int32_t ne_cur; + fin.read(reinterpret_cast(&ne_cur), sizeof(ne_cur)); + ne[i] = ne_cur; + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]); + + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n", + __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor)); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n", + __func__, name.data(), + (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], + (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]); + return false; + } + + size_t bpe = 0; + + switch (ftype) { + case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; + case 1: 
bpe = ggml_type_size(GGML_TYPE_F16); break; + case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; + case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + default: + { + fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); + return false; + } + }; + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), (size_t) nelements*bpe); + return false; + } + + fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); + + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + fprintf(stderr, "."); + fflush(stderr); + } + } + + if (n_tensors != int(model.tensors.size())) { + fprintf(stderr, "%s: model file has %d tensors, but %d tensors were expected\n", __func__, n_tensors, (int) model.tensors.size()); + return false; + } + + fprintf(stderr, " done\n"); + + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + fin.close(); + + return true; +} + +struct ggml_tensor * sam_fill_dense_pe( + const sam_model & model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + sam_state & state) { + const auto & hparams = model.hparams; + const auto & enc = model.enc_prompt; + + const int32_t n_img_embd = hparams.n_img_embd(); + const float n_img_embd_inv = 1.0f / n_img_embd; + + struct ggml_tensor * xy_embed_stacked = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2, n_img_embd, n_img_embd); + ggml_allocr_alloc(state.allocr, xy_embed_stacked); + + if (!ggml_allocr_is_measure(state.allocr)) { + float * data = (float *) ggml_get_data(xy_embed_stacked); + for (int i = 0; i < n_img_embd; ++i) { + const int row = 2*i*n_img_embd; + const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1; + for (int j = 0; j < n_img_embd; ++j) { + const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1; + data[row + 2*j + 0] = x_val; + data[row + 2*j + 1] = y_val; + } + } + } + + struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked); + + cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI))); + + // concat + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192 + { + struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL); + struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL); + + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1], cur->ne[2]); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], 0))); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], t_sin->nb[1]))); + } + + struct ggml_tensor * pe_img_dense = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + ggml_build_forward_expand(gf, pe_img_dense); + + return pe_img_dense; +} + +struct ggml_tensor* sam_layer_norm_2d( + struct ggml_context * ctx0, + struct ggml_tensor * layer, + int n_channels, + struct ggml_tensor * w, + struct ggml_tensor * b, + float eps) { + // LayerNorm2d + // normalize along channel dimension + // TODO: better implementation + layer = ggml_permute(ctx0, + ggml_norm(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, layer, 
1, 2, 0, 3)), eps), + 2, 0, 1, 3); + + layer = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, ggml_reshape_3d(ctx0, w, 1, 1, n_channels), layer), + layer), + ggml_repeat(ctx0, ggml_reshape_3d(ctx0, b, 1, 1, n_channels), layer)); + + return layer; +} + +struct ggml_cgraph * sam_encode_image( + const sam_model & model, + sam_state & state, + const sam_image_f32 & img) { + + const auto & hparams = model.hparams; + const auto & enc = model.enc_img; + + const int32_t n_enc_state = hparams.n_enc_state; + const int32_t n_enc_layer = hparams.n_enc_layer; + const int32_t n_enc_head = hparams.n_enc_head; + const int32_t n_enc_head_dim = hparams.n_enc_head_dim(); + const int32_t n_enc_out_chans = hparams.n_enc_out_chans; + const int32_t n_img_size = hparams.n_img_size(); + const int32_t n_window_size = hparams.n_window_size(); + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ state.buf_compute_img_enc.size(), + /*.mem_buffer =*/ state.buf_compute_img_enc.data(), + /*.no_alloc =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_img_size, n_img_size, 3, 1); + ggml_allocr_alloc(state.allocr, inp); + if (!ggml_allocr_is_measure(state.allocr)) { + float * data = (float *) ggml_get_data(inp); + + const int nx = img.nx; + const int ny = img.ny; + const int n = nx*ny; + + GGML_ASSERT(nx == n_img_size && ny == n_img_size); + + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k]; + } + } + } + } + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L392 + struct ggml_tensor * cur = ggml_conv_2d_sk_p0(ctx0, enc.proj_w, inp); + cur = ggml_add_inplace(ctx0, + cur, + ggml_repeat(ctx0, enc.proj_b, cur)); + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L394 + // keep in F32 + cur = ggml_cont(ctx0, + ggml_permute(ctx0, cur, 1, 2, 0, 3)); + + // convert to F16 + //cur = ggml_cpy(ctx0, + // ggml_permute(ctx0, cur, 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F16, n_enc_state, n_img_embd, n_img_embd)); + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L108-L109 + cur = ggml_add_inplace(ctx0, cur, enc.pe); + + struct ggml_tensor * inpL = cur; + + for (int il = 0; il < n_enc_layer; ++il) { + const auto & layer = enc.layers[il]; + + // norm + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L168 + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_mul(ctx0, cur, layer.norm1_w); + cur = ggml_add_inplace(ctx0, cur, layer.norm1_b); + } + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + if (hparams.is_global_attn(il) == false) { + // local attention layer - apply window partition + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L169-L172 + cur = ggml_win_part(ctx0, cur, n_window_size); + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add_inplace(ctx0, cur, layer.qkv_b); + + // split qkv into 
separate tensors + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L225-L229 + const int B = cur->ne[3]; + + cur = ggml_reshape_4d(ctx0, cur, n_enc_state, 3, W*H, B); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 3, 1, 2)); + + struct ggml_tensor * Q; + struct ggml_tensor * K; + struct ggml_tensor * V; + + Q = ggml_view_3d (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 0*cur->nb[3]); + Q = ggml_reshape_4d(ctx0, Q, n_enc_head_dim, n_enc_head, W*H, B); + Q = ggml_cont (ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, n_enc_head_dim, W*H, B*n_enc_head); + + K = ggml_view_3d (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 1*cur->nb[3]); + K = ggml_reshape_4d(ctx0, K, n_enc_head_dim, n_enc_head, W*H, B); + K = ggml_cont (ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, n_enc_head_dim, W*H, B*n_enc_head); + + V = ggml_view_3d (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 2*cur->nb[3]); + V = ggml_reshape_4d(ctx0, V, n_enc_head_dim, n_enc_head, W*H, B); + V = ggml_cont (ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // transposed + V = ggml_reshape_3d(ctx0, V, W*H, n_enc_head_dim, B*n_enc_head); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(n_enc_head_dim)) + ); + + struct ggml_tensor * rw = ggml_get_rel_pos(ctx0, layer.rel_pos_w, W, W); + struct ggml_tensor * rh = ggml_get_rel_pos(ctx0, layer.rel_pos_h, H, H); + + struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Q, n_enc_head_dim, W, H, B*n_enc_head); + + struct ggml_tensor * rel_w = ggml_cont(ctx0, ggml_permute(ctx0, + ggml_mul_mat(ctx0, + rw, + ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))), + 0, 2, 1, 3)); + struct ggml_tensor * rel_h = ggml_mul_mat(ctx0, rh, q_r); + + struct ggml_tensor * attn = ggml_add_rel_pos_inplace(ctx0, KQ_scaled, rel_w, rel_h); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, attn); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + cur = + ggml_reshape_4d(ctx0, + ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, KQV, n_enc_head_dim, W*H, n_enc_head, B), + 0, 2, 1, 3)), + n_enc_state, W, H, B); + + cur = ggml_mul_mat(ctx0, layer.proj_w, cur); + cur = ggml_add_inplace(ctx0, cur, layer.proj_b); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = ggml_win_unpart(ctx0, cur, w0, h0, n_window_size); + } + + cur = ggml_add_inplace(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_mul(ctx0, cur, layer.norm2_w); + cur = ggml_add_inplace(ctx0, cur, layer.norm2_b); + } + + // fully connected + cur = ggml_mul_mat(ctx0, layer.mlp_lin1_w, cur); + cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin1_b); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + cur = ggml_mul_mat(ctx0, layer.mlp_lin2_w, cur); + cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin2_b); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 2, 0, 1, 3)); + + cur = ggml_conv_2d_sk_p0(ctx0, enc.neck_conv_0, cur); + + cur = sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_0_w, enc.neck_norm_0_b, hparams.eps); + + cur = ggml_conv_2d_s1_ph(ctx0, enc.neck_conv_1, cur); + + cur = 
sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_1_w, enc.neck_norm_1_b, hparams.eps); + + cur = ggml_cpy(ctx0, cur, state.embd_img); + + ggml_build_forward_expand(gf, cur); + ggml_disconnect_node_from_graph(state.embd_img); + + //ggml_graph_print(&gf); + + ggml_free(ctx0); + + return gf; +} + + +struct prompt_encoder_result { + struct ggml_tensor * embd_prompt_sparse = {}; + struct ggml_tensor * embd_prompt_dense = {}; +}; + +// encode a prompt +// +// - points +// - boxes +// - masks +// +// TODO: currently just encode a single point for simplicity +// +prompt_encoder_result sam_encode_prompt( + const sam_model & model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + sam_state & state, + int nx, + int ny, + sam_point point) { + + const auto & hparams = model.hparams; + const auto & enc = model.enc_prompt; + + // transform points + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276 + { + const int nmax = std::max(nx, ny); + + const float scale = hparams.n_img_size() / (float) nmax; + + const int nx_new = int(nx*scale + 0.5f); + const int ny_new = int(ny*scale + 0.5f); + + point.x = point.x*(float(nx_new)/nx) + 0.5f; + point.y = point.y*(float(ny_new)/ny) + 0.5f; + } + + struct ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2, 2); + + ggml_allocr_alloc(state.allocr, inp); + if (!ggml_allocr_is_measure(state.allocr)) { + // set the input by converting the [0, 1] coordinates to [-1, 1] + float * data = (float *) inp->data; + + data[0] = 2.0f*(point.x / hparams.n_img_size()) - 1.0f; + data[1] = 2.0f*(point.y / hparams.n_img_size()) - 1.0f; + + // padding + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85 + data[2] = 2.0f*(0.0f) - 1.0f; + data[3] = 2.0f*(0.0f) - 1.0f; + } + + struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp); + + cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI))); + + // concat + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192 + { + struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL); + struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL); + + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1]); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], 0))); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], t_sin->nb[1]))); + + // overwrite label == -1 with not_a_point_embed.weight + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L86 + // TODO: extend for multiple points + ggml_build_forward_expand(gf, ggml_cpy(ctx0, enc.not_a_pt_embd_w, ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], cur->nb[1]))); + } + + // add point_embeddings[1] to label == 1 + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L90 + struct ggml_tensor * v = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], 0); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_add_inplace(ctx0, v, enc.pt_embd[1]), v)); + + struct ggml_tensor * embd_prompt_sparse = cur; + ggml_build_forward_expand(gf, embd_prompt_sparse); + + struct 
ggml_tensor * embd_prompt_dense = ggml_repeat(ctx0, + ggml_cont(ctx0, + ggml_view_3d(ctx0, enc.no_mask_embd_w, + 1, 1, enc.no_mask_embd_w->ne[0], enc.no_mask_embd_w->nb[0], enc.no_mask_embd_w->nb[0], 0)), + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hparams.n_img_embd(), hparams.n_img_embd(), hparams.n_enc_out_chans)); + + ggml_build_forward_expand(gf, embd_prompt_dense); + + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + prompt_encoder_result res; + res.embd_prompt_sparse = embd_prompt_sparse; + res.embd_prompt_dense = embd_prompt_dense; + return res; +} + +struct ggml_tensor* sam_decode_mask_transformer_attn( + const sam_layer_dec_transformer_attn & attn, + struct ggml_tensor * queries, + struct ggml_tensor * keys, + struct ggml_tensor * values, + struct ggml_context * ctx0, + const sam_model & model) { + const auto & hparams = model.hparams; + const int n_head = hparams.n_dec_heads; + + struct ggml_tensor * Qcur = {}; + struct ggml_tensor * Kcur = {}; + struct ggml_tensor * Vcur = {}; + + Qcur = ggml_mul_mat(ctx0, attn.q_w, queries); + Qcur = ggml_add_inplace(ctx0, Qcur, attn.q_b); + + Kcur = ggml_mul_mat(ctx0, attn.k_w, keys); + Kcur = ggml_add_inplace(ctx0, Kcur, attn.k_b); + + Vcur = ggml_mul_mat(ctx0, attn.v_w, values); + Vcur = ggml_add_inplace(ctx0, Vcur, attn.v_b); + + struct ggml_tensor * Q = {}; + struct ggml_tensor * K = {}; + struct ggml_tensor * V = {}; + + Q = ggml_reshape_4d(ctx0, Qcur, Qcur->ne[0]/n_head, n_head, Qcur->ne[1], Qcur->ne[2]); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + K = ggml_reshape_4d(ctx0, Kcur, Kcur->ne[0]/n_head, n_head, Kcur->ne[1], Kcur->ne[2]); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + + V = ggml_reshape_4d(ctx0, Vcur, Vcur->ne[0]/n_head, n_head, Vcur->ne[1], Vcur->ne[2]); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); + + // Q * K + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(Q->ne[0])))); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, KQ_soft_max, ggml_cont(ctx0, ggml_transpose(ctx0, V))); + + struct ggml_tensor * KQV_merged = ggml_cont(ctx0, ggml_transpose(ctx0, KQV)); + KQV_merged = ggml_cont(ctx0, ggml_permute(ctx0, KQV_merged, 0, 2, 1, 3)); + KQV_merged = ggml_reshape_3d(ctx0, KQV_merged, KQV_merged->ne[0]*KQV_merged->ne[1], KQV_merged->ne[2], KQV_merged->ne[3]); + KQV_merged = ggml_mul_mat(ctx0, attn.out_w, KQV_merged); + KQV_merged = ggml_add_inplace(ctx0, KQV_merged, attn.out_b); + + return KQV_merged; +} + +struct ggml_tensor * sam_decode_mask_mlp_relu_3( + struct ggml_tensor * in, + struct ggml_tensor * w_0, + struct ggml_tensor * b_0, + struct ggml_tensor * w_1, + struct ggml_tensor * b_1, + struct ggml_tensor * w_2, + struct ggml_tensor * b_2, + struct ggml_context * ctx0) { + + struct ggml_tensor * cur = {}; + cur = ggml_mul_mat(ctx0, w_0, in); + cur = ggml_add_inplace(ctx0, cur, b_0); + + cur = ggml_relu_inplace(ctx0, cur); + + cur = ggml_mul_mat(ctx0, w_1, cur); + cur = ggml_add_inplace(ctx0, cur, b_1); + + cur = ggml_relu_inplace(ctx0, cur); + + cur = ggml_mul_mat(ctx0, w_2, cur); + cur = ggml_add_inplace(ctx0, cur, b_2); + + return cur; +} + +bool sam_decode_mask( + const sam_model & model, + const prompt_encoder_result & prompt, + struct ggml_tensor * pe_img, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + sam_state & state) { + + const auto & hparams = model.hparams; + 
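// the mask decoder: a small two-layer two-way transformer that mixes the prompt tokens with the image embedding + 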
const auto & dec = model.dec; + const int n_img_embd = hparams.n_img_embd(); + + struct ggml_tensor * tokens = {}; + { + // Concatenate output tokens + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L120 + const auto& sparse = prompt.embd_prompt_sparse; + + tokens = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, dec.iou_token_w->ne[0], dec.iou_token_w->ne[1] + dec.mask_tokens_w->ne[1] + sparse->ne[1], sparse->ne[2]); + + const size_t offsets[3] = { 0, dec.iou_token_w->ne[1]*tokens->nb[1], dec.iou_token_w->ne[1]*tokens->nb[1] + dec.mask_tokens_w->ne[1]*tokens->nb[1] }; + ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.iou_token_w, ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.iou_token_w->ne[1], tokens->nb[1], offsets[0]))); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.mask_tokens_w, ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.mask_tokens_w->ne[1], tokens->nb[1], offsets[1]))); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, sparse, ggml_view_2d(ctx0, tokens, tokens->ne[0], sparse->ne[1], tokens->nb[1], offsets[2]))); + // TODO: Sparse prompt embeddings can have more than one point + } + + + struct ggml_tensor * src = {}; + struct ggml_tensor * pos_src = {}; + int srcNE[4] = { 0, 0, 0, 0 }; + { + // Expand per-image data in the batch direction to be per-mask + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L125 + src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, state.embd_img->ne[0], state.embd_img->ne[1], state.embd_img->ne[2], tokens->ne[2]); + + src = ggml_add(ctx0, + ggml_repeat(ctx0, + state.embd_img, + src), + prompt.embd_prompt_dense); + + srcNE[0] = src->ne[0]; + srcNE[1] = src->ne[1]; + srcNE[2] = src->ne[2]; + srcNE[3] = src->ne[3]; + + // flatten & permute + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83 + src = ggml_cont(ctx0, ggml_permute(ctx0, + ggml_view_3d(ctx0, + src, + src->ne[0]*src->ne[1], + src->ne[2], + src->ne[3], + src->nb[2], + src->nb[3], + 0), + 1, 0, 2, 3)); + + pos_src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, pe_img->ne[0], pe_img->ne[1], pe_img->ne[2], tokens->ne[2]); + pos_src = ggml_repeat(ctx0, + pe_img, + pos_src); + + // flatten & permute + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83 + pos_src = ggml_cont(ctx0, ggml_permute(ctx0, + ggml_view_3d(ctx0, + pos_src, + pos_src->ne[0]*pos_src->ne[1], + pos_src->ne[2], + pos_src->ne[3], + pos_src->nb[2], + pos_src->nb[3], + 0), + 1, 0, 2, 3)); + } + + struct ggml_tensor * queries = tokens; + struct ggml_tensor * keys = src; + { + // Run the transformer + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L62 + for (int i = 0; i < int(model.dec.transformer_layers.size()); ++i) { + const auto& tfm_layer = model.dec.transformer_layers[i]; + + // Self attention block + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L154 + const bool skip_first_layer_pe = i == 0; + if (skip_first_layer_pe) { + queries = sam_decode_mask_transformer_attn(tfm_layer.self_attn, queries, queries, queries, ctx0, model); + } + else { + 
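// from the second layer on, the point embeddings (tokens) are re-added to the queries for Q and K, while V attends over the raw queries + 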
struct ggml_tensor * q_0 = ggml_add(ctx0, queries, tokens); + + struct ggml_tensor * self_attn = sam_decode_mask_transformer_attn(tfm_layer.self_attn, q_0, q_0, queries, ctx0, model); + queries = ggml_add(ctx0, queries, self_attn); + } + + queries = ggml_norm(ctx0, queries, hparams.eps_decoder_transformer); + queries = ggml_add_inplace(ctx0, + ggml_mul(ctx0, queries, tfm_layer.norm1_w), + tfm_layer.norm1_b); + + // Cross attention block, tokens attending to image embedding + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L163 + struct ggml_tensor * q_1 = ggml_add(ctx0, queries, tokens); + struct ggml_tensor * k_1 = ggml_add(ctx0, keys, pos_src); + + struct ggml_tensor * cross_attn_token_to_img = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_token_to_img, q_1, k_1, keys, ctx0, model); + + queries = ggml_add_inplace(ctx0, queries, cross_attn_token_to_img); + queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer); + queries = ggml_add_inplace(ctx0, + ggml_mul(ctx0, queries, tfm_layer.norm2_w), + tfm_layer.norm2_b); + + // MLP block + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L170 + struct ggml_tensor * mlp_out = ggml_mul_mat(ctx0, + tfm_layer.mlp_lin1_w, + queries); + + mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin1_b); + + // RELU activation + mlp_out = ggml_relu_inplace(ctx0, mlp_out); + mlp_out = ggml_mul_mat(ctx0, tfm_layer.mlp_lin2_w, mlp_out); + + mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin2_b); + + queries = ggml_add_inplace(ctx0, queries, mlp_out); + queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer); + queries = ggml_add_inplace(ctx0, + ggml_mul(ctx0, queries, tfm_layer.norm3_w), + tfm_layer.norm3_b); + + // Cross attention block, image embedding attending to tokens + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L175 + struct ggml_tensor * q_2 = ggml_add(ctx0, queries, tokens); + struct ggml_tensor * k_2 = ggml_add(ctx0, keys, pos_src); + + struct ggml_tensor * cross_attn_img_to_token = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_img_to_token, k_2, q_2, queries, ctx0, model); + keys = ggml_add_inplace(ctx0, keys, cross_attn_img_to_token); + keys = ggml_norm_inplace(ctx0, keys, hparams.eps_decoder_transformer); + keys = ggml_add_inplace(ctx0, + ggml_mul(ctx0, keys, tfm_layer.norm4_w), + tfm_layer.norm4_b); + } + + // Apply the final attention layer from the points to the image + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L99 + struct ggml_tensor * q = ggml_add(ctx0, queries, tokens); + struct ggml_tensor * k = ggml_add(ctx0, keys, pos_src); + + struct ggml_tensor * final_attn_token_to_img = sam_decode_mask_transformer_attn(dec.transformer_final_attn_token_to_img, q, k, keys, ctx0, model); + + queries = ggml_add_inplace(ctx0, queries, final_attn_token_to_img); + queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer); + queries = ggml_add_inplace(ctx0, + ggml_mul(ctx0, queries, dec.transformer_norm_final_w), + dec.transformer_norm_final_b); + } + + + struct ggml_tensor * iou_pred = ggml_view_2d(ctx0, queries, queries->ne[0], queries->ne[2], queries->nb[2], 0); + 
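// token 0 of the decoder output is the IoU prediction token; the next num_mask_tokens rows are the mask tokens + 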
const int num_mask_tokens = 4; // num_multimask_outputs + 1 + struct ggml_tensor * mask_tokens_out = ggml_view_3d(ctx0, queries, queries->ne[0], num_mask_tokens, queries->ne[2], queries->nb[1], num_mask_tokens*queries->nb[1], queries->nb[1]); + + // Upscale mask embeddings and predict masks using the mask tokens + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L136 + keys = ggml_cont(ctx0, ggml_transpose(ctx0, keys)); + keys = ggml_view_4d(ctx0, keys, srcNE[0], srcNE[1], srcNE[2], srcNE[3], srcNE[0]*keys->nb[0], keys->nb[1], keys->nb[2], 0); + // ggml_build_forward_expand(gf, keys); + struct ggml_tensor * upscaled_embedding = {}; + { + // ConvTranspose2d + keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2); + ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed + keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0, + ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]), + keys)); + + keys = sam_layer_norm_2d(ctx0, keys, n_img_embd, dec.output_upscaling_1_w, dec.output_upscaling_1_b, hparams.eps); + + // GELU activation + keys = ggml_gelu_inplace(ctx0, keys); + + // ConvTranspose2d + keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2); + ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed + keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0, + ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]), + keys), keys); + // GELU activation + keys = ggml_gelu_inplace(ctx0, keys); + upscaled_embedding = ggml_reshape_3d(ctx0, keys, keys->ne[0]*keys->ne[1], keys->ne[2], keys->ne[3]); + upscaled_embedding = ggml_cont(ctx0, ggml_transpose(ctx0, upscaled_embedding)); // TODO: Shouldn't be needed + } + + struct ggml_tensor * hyper_in = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_img_embd/2, num_mask_tokens, mask_tokens_out->ne[2]); + + for (int i = 0; i < num_mask_tokens; ++i) { + const auto& mlp = dec.output_hypernet_mlps[i]; + struct ggml_tensor * in = ggml_view_2d(ctx0, mask_tokens_out, mask_tokens_out->ne[0], mask_tokens_out->ne[2], mask_tokens_out->nb[1], i*mask_tokens_out->nb[1]); + struct ggml_tensor * out = sam_decode_mask_mlp_relu_3(in, mlp.w_0, mlp.b_0, mlp.w_1, mlp.b_1, mlp.w_2, mlp.b_2, ctx0); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, ggml_view_2d(ctx0, hyper_in, hyper_in->ne[0], hyper_in->ne[2], hyper_in->nb[1], i*hyper_in->nb[1]))); + } + + struct ggml_tensor * masks = ggml_mul_mat(ctx0, hyper_in, upscaled_embedding); + masks = ggml_cont(ctx0, ggml_transpose(ctx0, masks)); // TODO: Shouldn't be needed + masks = ggml_reshape_4d(ctx0, masks, keys->ne[0], keys->ne[1], masks->ne[1], keys->ne[3]); + + // Generate mask quality predictions + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L146 + iou_pred = sam_decode_mask_mlp_relu_3(iou_pred, dec.iou_prediction_head_0_w, dec.iou_prediction_head_0_b, dec.iou_prediction_head_1_w, dec.iou_prediction_head_1_b, dec.iou_prediction_head_2_w, dec.iou_prediction_head_2_b, ctx0); + + // Select the correct mask or masks for output + // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L101 + iou_pred = ggml_cpy(state.ctx, ggml_view_1d(ctx0, iou_pred, iou_pred->ne[0] - 1, iou_pred->nb[0]), 
state.iou_predictions); + masks = ggml_view_4d(ctx0, masks, masks->ne[0], masks->ne[1], masks->ne[2] - 1, masks->ne[3], + masks->nb[1], masks->nb[2], masks->nb[3], masks->nb[2] /* offset*/); + masks = ggml_cpy(state.ctx, masks, state.low_res_masks); + + ggml_build_forward_expand(gf, masks); + ggml_build_forward_expand(gf, iou_pred); + + ggml_disconnect_node_from_graph(state.low_res_masks); + ggml_disconnect_node_from_graph(state.iou_predictions); + + return true; +} + +bool sam_write_masks(const sam_hparams& hparams, int nx, int ny, const sam_state & state) { + if (state.low_res_masks->ne[2] == 0) return true; + if (state.low_res_masks->ne[2] != state.iou_predictions->ne[0]) { + printf("Error: number of masks (%d) does not match number of iou predictions (%d)\n", (int)state.low_res_masks->ne[2], (int)state.iou_predictions->ne[0]); + return false; + } + + const int n_img_size = hparams.n_img_size(); + const float mask_threshold = hparams.mask_threshold; + const float iou_threshold = hparams.iou_threshold; + const float stability_score_threshold = hparams.stability_score_threshold; + const float intersection_threshold = mask_threshold + hparams.stability_score_offset; + const float union_threshold = mask_threshold - hparams.stability_score_offset; + + const int ne0 = state.low_res_masks->ne[0]; + const int ne1 = state.low_res_masks->ne[1]; + const int ne2 = state.low_res_masks->ne[2]; + + // Remove padding and upscale masks to the original image size. + // ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L140 + + const float preprocess_scale = std::max(nx, ny) / float(n_img_size); + const int cropped_nx = int(nx / preprocess_scale + 0.5f); + const int cropped_ny = int(ny / preprocess_scale + 0.5f); + + const float scale_x_1 = (float)ne0 / (float)n_img_size; + const float scale_y_1 = (float)ne1 / (float)n_img_size; + + const float scale_x_2 = float(cropped_nx) / float(nx); + const float scale_y_2 = float(cropped_ny) / float(ny); + + const auto iou_data = (float*)state.iou_predictions->data; + + for (int i = 0; i < ne2; ++i) { + if (iou_threshold > 0.f && iou_data[i] < iou_threshold) { + printf("Skipping mask %d with iou %f below threshold %f\n", i, iou_data[i], iou_threshold); + continue; // Filtering masks with iou below the threshold + } + + // bilinear upsampling of the i-th low-res mask to the n_img_size grid + std::vector<float> mask_data(n_img_size*n_img_size); + { + const float* data = (float *) state.low_res_masks->data + i*ne0*ne1; + + for (int iy = 0; iy < n_img_size; ++iy) { + for (int ix = 0; ix < n_img_size; ++ix) { + const float sx = std::max(scale_x_1*(ix + 0.5f) - 0.5f, 0.0f); + const float sy = std::max(scale_y_1*(iy + 0.5f) - 0.5f, 0.0f); + + const int x0 = std::max(0, (int)sx); + const int y0 = std::max(0, (int)sy); + + const int x1 = std::min(x0 + 1, ne0 - 1); + const int y1 = std::min(y0 + 1, ne1 - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = y0*ne0 + x0; + const int j01 = y0*ne0 + x1; + const int j10 = y1*ne0 + x0; + const int j11 = y1*ne0 + x1; + + const float v00 = data[j00]; + const float v01 = data[j01]; + const float v10 = data[j10]; + const float v11 = data[j11]; + + const float v0 = (1-dx)*v00 + dx*v01; + const float v1 = (1-dx)*v10 + dx*v11; + + const float v = (1-dy)*v0 + dy*v1; + + mask_data[iy*n_img_size + ix] = v; + } + } + } + + int intersections = 0; + int unions = 0; + sam_image_u8 res; + int min_iy = ny; + int max_iy = 0; + int min_ix = nx; + int max_ix = 0; + { + const float* data = mask_data.data(); + + 
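// resample from the padded n_img_size grid back to the original (nx, ny) resolution and threshold into a binary mask + 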
res.nx = nx; + res.ny = ny; + res.data.resize(nx*ny); + + for (int iy = 0; iy < ny; ++iy) { + for (int ix = 0; ix < nx; ++ix) { + const float sx = std::max(scale_x_2*(ix + 0.5f) - 0.5f, 0.0f); + const float sy = std::max(scale_y_2*(iy + 0.5f) - 0.5f, 0.0f); + + const int x0 = std::max(0, (int)sx); + const int y0 = std::max(0, (int)sy); + + const int x1 = std::min(x0 + 1, cropped_nx - 1); + const int y1 = std::min(y0 + 1, cropped_ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = y0*n_img_size + x0; + const int j01 = y0*n_img_size + x1; + const int j10 = y1*n_img_size + x0; + const int j11 = y1*n_img_size + x1; + + const float v00 = data[j00]; + const float v01 = data[j01]; + const float v10 = data[j10]; + const float v11 = data[j11]; + + const float v0 = (1-dx)*v00 + dx*v01; + const float v1 = (1-dx)*v10 + dx*v11; + + const float v = (1-dy)*v0 + dy*v1; + + if (v > intersection_threshold) { + intersections++; + } + if (v > union_threshold) { + unions++; + } + if (v > mask_threshold) { + min_iy = std::min(min_iy, iy); + max_iy = std::max(max_iy, iy); + min_ix = std::min(min_ix, ix); + max_ix = std::max(max_ix, ix); + + res.data[iy*nx + ix] = 255; + } + } + } + } + + const float stability_score = float(intersections) / float(unions); + if (stability_score_threshold > 0.f && stability_score < stability_score_threshold) { + printf("Skipping mask %d with stability score %f below threshold %f\n", i, stability_score, stability_score_threshold); + continue; // Filtering masks with stability score below the threshold + } + + printf("Mask %d: iou = %f, stability_score = %f, bbox (%d, %d), (%d, %d)\n", + i, iou_data[i], stability_score, min_ix, max_ix, min_iy, max_iy); + + std::string filename = "mask_out_" + std::to_string(i) + ".png"; + if (!stbi_write_png(filename.c_str(), res.nx, res.ny, 1, res.data.data(), res.nx)) { + printf("%s: failed to write mask %s\n", __func__, filename.c_str()); + return false; + } + } + + + return true; +} + +struct ggml_cgraph * sam_build_fast_graph( + const sam_model & model, + sam_state & state, + int nx, + int ny, + sam_point point) { + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ state.buf_compute_fast.size(), + /*.mem_buffer =*/ state.buf_compute_fast.data(), + /*.no_alloc =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state, nx, ny, point); + if (!enc_res.embd_prompt_sparse || !enc_res.embd_prompt_dense) { + fprintf(stderr, "%s: failed to encode prompt\n", __func__); + return {}; + } + + struct ggml_tensor * pe_img_dense = sam_fill_dense_pe(model, ctx0, gf, state); + if (!pe_img_dense) { + fprintf(stderr, "%s: failed to get dense positional encoding\n", __func__); + return {}; + } + + if (!sam_decode_mask(model, enc_res, pe_img_dense, ctx0, gf, state)) { + fprintf(stderr, "%s: failed to decode mask\n", __func__); + return {}; + } + + ggml_free(ctx0); + + return gf; +} +struct sam_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + + std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path + std::string fname_inp = "img.jpg"; + std::string fname_out = "img.out"; +}; + +void sam_print_usage(int argc, char ** argv, const sam_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + 
fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -i FNAME, --inp FNAME\n"); + fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str()); + fprintf(stderr, " -o FNAME, --out FNAME\n"); + fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str()); + fprintf(stderr, "\n"); +} + +bool sam_params_parse(int argc, char ** argv, sam_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(argv[++i]); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-i" || arg == "--inp") { + params.fname_inp = argv[++i]; + } else if (arg == "-o" || arg == "--out") { + params.fname_out = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + sam_print_usage(argc, argv, params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + sam_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +int main(int argc, char ** argv) { + const int64_t t_main_start_us = ggml_time_us(); + + sam_params params; + params.model = "models/sam-vit-b/ggml-model-f16.bin"; + + sam_model model; + sam_state state; + int64_t t_load_us = 0; + + if (sam_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + // load the image + sam_image_u8 img0; + if (!sam_image_load_from_file(params.fname_inp, img0)) { + fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, params.fname_inp.c_str()); + return 1; + } + fprintf(stderr, "%s: loaded image '%s' (%d x %d)\n", __func__, params.fname_inp.c_str(), img0.nx, img0.ny); + + // preprocess to f32 + sam_image_f32 img1; + if (!sam_image_preprocess(img0, img1)) { + fprintf(stderr, "%s: failed to preprocess image\n", __func__); + return 1; + } + fprintf(stderr, "%s: preprocessed image (%d x %d)\n", __func__, img1.nx, img1.ny); + + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!sam_model_load(params.model, model)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + } + + { + static size_t buf_size = 256u*1024*1024; + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + state.ctx = ggml_init(ggml_params); + + state.embd_img = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32, + model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans); + + state.low_res_masks = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32, + model.hparams.n_enc_out_chans, model.hparams.n_enc_out_chans, 3); + + state.iou_predictions = ggml_new_tensor_1d(state.ctx, GGML_TYPE_F32, 3); + } + + + static const size_t tensor_alignment = 32; + { + state.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + state.allocr = 
ggml_allocr_new_measure(tensor_alignment); + struct ggml_cgraph * gf_measure = sam_encode_image(model, state, img1); + if (!gf_measure) { + fprintf(stderr, "%s: failed to encode image\n", __func__); + return 1; + } + + size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment; + ggml_allocr_free(state.allocr); + + // recreate allocator with exact memory requirements + state.buf_alloc_img_enc.resize(alloc_size); + state.allocr = ggml_allocr_new(state.buf_alloc_img_enc.data(), state.buf_alloc_img_enc.size(), tensor_alignment); + + // compute the graph with the measured exact memory requirements from above + ggml_allocr_reset(state.allocr); + + struct ggml_cgraph * gf = sam_encode_image(model, state, img1); + if (!gf) { + fprintf(stderr, "%s: failed to encode image\n", __func__); + return 1; + } + + ggml_allocr_alloc_graph(state.allocr, gf); + + ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads); + + print_t_f32("embd_img", state.embd_img); + + ggml_allocr_free(state.allocr); + state.allocr = NULL; + state.work_buffer.clear(); + } + { + state.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + state.allocr = ggml_allocr_new_measure(tensor_alignment); + + // TODO: user input + const sam_point pt = { 414.375f, 162.796875f, }; + // measure memory requirements for the graph + struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, state, img0.nx, img0.ny, pt); + if (!gf_measure) { + fprintf(stderr, "%s: failed to build fast graph to measure\n", __func__); + return 1; + } + + size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment; + ggml_allocr_free(state.allocr); + + // recreate allocator with exact memory requirements + state.buf_alloc_fast.resize(alloc_size); + state.allocr = ggml_allocr_new(state.buf_alloc_fast.data(), state.buf_alloc_fast.size(), tensor_alignment); + + // compute the graph with the measured exact memory requirements from above + ggml_allocr_reset(state.allocr); + + struct ggml_cgraph * gf = sam_build_fast_graph(model, state, img0.nx, img0.ny, pt); + if (!gf) { + fprintf(stderr, "%s: failed to build fast graph\n", __func__); + return 1; + } + + ggml_allocr_alloc_graph(state.allocr, gf); + + ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads); + + //print_t_f32("iou_predictions", state.iou_predictions); + //print_t_f32("low_res_masks", state.low_res_masks); + ggml_allocr_free(state.allocr); + state.allocr = NULL; + } + + if (!sam_write_masks(model.hparams, img0.nx, img0.ny, state)) { + fprintf(stderr, "%s: failed to write masks\n", __func__); + return 1; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + fprintf(stderr, "\n\n"); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/starcoder/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..557f4e5d5fbb848ef185c8e401bfc681c7bca2c9 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/starcoder/CMakeLists.txt @@ -0,0 +1,31 @@ +# +# starcoder + +set(TEST_TARGET starcoder) +add_executable(${TEST_TARGET} main.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# starcoder-mmap + +set(TEST_TARGET starcoder-mmap) 
+add_executable(${TEST_TARGET} starcoder-mmap.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# starcoder-quantize + +set(TEST_TARGET starcoder-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# For GPU offloading + +if (GGML_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) +endif() +if (GGML_CLBLAST) + add_compile_definitions(GGML_USE_CLBLAST) +endif() + diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/README.md b/stable-diffusion.cpp/ggml/examples/starcoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7d62c0d7cfbd7e42beb70fc4e6c09542c3db25e2 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/starcoder/README.md @@ -0,0 +1,115 @@ +# 💫 StarCoder + +This is a C++ example running 💫 StarCoder inference using the [ggml](https://github.com/ggerganov/ggml) library. + +The program runs on the CPU - no video card is required. + +The example supports the following 💫 StarCoder models: + +- `bigcode/starcoder` +- `bigcode/gpt_bigcode-santacoder` aka the smol StarCoder + +Sample performance on MacBook M1 Pro: + +TODO + + +Sample output: + +``` +$ ./bin/starcoder -h +usage: ./bin/starcoder [options] + +options: + -h, --help show this help message and exit + -s SEED, --seed SEED RNG seed (default: -1) + -t N, --threads N number of threads to use during computation (default: 8) + -p PROMPT, --prompt PROMPT + prompt to start generation with (default: random) + -n N, --n_predict N number of tokens to predict (default: 200) + --top_k N top-k sampling (default: 40) + --top_p N top-p sampling (default: 0.9) + --temp N temperature (default: 1.0) + -b N, --batch_size N batch size for prompt processing (default: 8) + -m FNAME, --model FNAME + model path (default: models/starcoder-117M/ggml-model.bin) + +$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2 +main: seed = 1683881276 +starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin' +starcoder_model_load: n_vocab = 49280 +starcoder_model_load: n_ctx = 2048 +starcoder_model_load: n_embd = 2048 +starcoder_model_load: n_head = 16 +starcoder_model_load: n_layer = 24 +starcoder_model_load: ftype = 3 +starcoder_model_load: ggml ctx size = 1794.90 MB +starcoder_model_load: memory size = 768.00 MB, n_mem = 49152 +starcoder_model_load: model size = 1026.83 MB +main: prompt: 'def fibonnaci(' +main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7 + +def fibonnaci(n): + if n == 0: + return 0 + elif n == 1: + return 1 + else: + return fibonacci(n-1) + fibonacci(n-2) + +print(fibo(10)) + +main: mem per token = 9597928 bytes +main: load time = 480.43 ms +main: sample time = 26.21 ms +main: predict time = 3987.95 ms / 19.36 ms per token +main: total time = 4580.56 ms +``` + +## Quick start +```bash +git clone https://github.com/ggerganov/ggml +cd ggml + +# Install Python dependencies +python3 -m pip install -r requirements.txt + +# Convert HF model to ggml +python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder + +# Build ggml + examples +mkdir build && cd build +cmake .. 
&& make -j4 starcoder starcoder-quantize
+
+# quantize the model
+./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
+
+# run inference
+./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" --top_k 0 --top_p 0.95 --temp 0.2
+```
+
+## Downloading and converting the original models (💫 StarCoder)
+
+You can download the original model and convert it to `ggml` format using the script `convert-hf-to-ggml.py`:
+
+```
+# Convert HF model to ggml
+python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder
+```
+
+This conversion requires Python and the `transformers` package to be installed.
+
+## Quantizing the models
+
+You can also quantize the `ggml` models via 4-bit integer quantization:
+
+```
+# quantize the model
+./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
+```
+
+| Model | Original size | Quantized size | Quantization type |
+| --- | --- | --- | --- |
+| `bigcode/gpt_bigcode-santacoder` | 5396.45 MB | 1026.83 MB | 4-bit integer (q4_1) |
+| `bigcode/starcoder` | 71628.23 MB | 13596.23 MB | 4-bit integer (q4_1) |
diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/convert-hf-to-ggml.py b/stable-diffusion.cpp/ggml/examples/starcoder/convert-hf-to-ggml.py
new file mode 100644
index 0000000000000000000000000000000000000000..30af75cb9f8e5358bbf25fc8579882fc0b78dee5
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/starcoder/convert-hf-to-ggml.py
@@ -0,0 +1,208 @@
+# Convert HF models to ggml format
+#
+
+import sys
+import struct
+import json
+import torch
+import numpy as np
+import re
+import os
+import argparse
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +parser = argparse.ArgumentParser(description='Convert starcoder HF model to GGML') +parser.add_argument('model_name_or_path', type=str, help='Name of model on HF hub, or local model folder') +parser.add_argument('--outfile', type=str, default='ggml-model.bin', help='Path of GGML file to write.') +parser.add_argument('--use_f32', action="store_true", help='Save GGML file in fp32') + +args = parser.parse_args() + +# use 16-bit or 32-bit floats +use_f16 = not args.use_f32 + +fname_out = args.outfile +fname_dir = os.path.dirname(fname_out) +if fname_dir: + os.makedirs(fname_dir, exist_ok=True) + +print("Loading model: ", args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) +config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True) +hparams = config.to_dict() +model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config, torch_dtype=torch.float16 if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True, offload_state_dict=True) +print("Model loaded: ", args.model_name_or_path) + +list_vars = model.state_dict() + +encoder = tokenizer.vocab +# Add added_tokens (special tokens) to the encoder +encoder.update(tokenizer.get_added_vocab()) +print(hparams) + +print("Saving ggml model to: ", fname_out) +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +vocab_size = hparams["vocab_size"] +fout.write(struct.pack("i", vocab_size)) +# fout.write(struct.pack("i", len(encoder))) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", vocab_size)) + +counter = 0 +# sort by value +for key in sorted(encoder, key=encoder.get): + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + counter += 1 + +# TODO: Repeat last token until vocab_size +while counter < vocab_size: + fout.write(struct.pack("i", len(text))) + fout.write(text) + counter += 1 +# assert counter == config.vocab_size + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # rename headers to keep compatibility + if name == "transformer.ln_f.weight": + name = "model/ln_f/g" + elif name == "transformer.ln_f.bias": + name = "model/ln_f/b" + elif name == "transformer.wte.weight": + name = "model/wte" + elif name == "transformer.wpe.weight": + name = "model/wpe" + elif name == "lm_head.weight": + name = "model/lm_head" + elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): + i = 
re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"transformer.h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"transformer.h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + "model/h.*/attn/c_attn/w" + "model/h.*/attn/c_proj/w" + "model/h.*/mlp/c_fc/w" + "model/h.*/mlp/c_proj/w" + if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b": + print(" Duplicate K,V heads to use MHA instead of MQA") + + embed_dim = hparams["n_embd"] + head_dim = embed_dim // hparams["n_head"] + + # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + if len(k.shape) == 2: + k = np.tile(k, (hparams["n_head"], 1)) + v = np.tile(v, (hparams["n_head"], 1)) + elif len(k.shape) == 1: + k = np.tile(k, (hparams["n_head"])) + v = np.tile(v, (hparams["n_head"])) + # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + data = np.concatenate((q, k, v), axis=0) + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. 
Output file: " + fname_out) +print("") diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/main.cpp b/stable-diffusion.cpp/ggml/examples/starcoder/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..946210b85915adb700c76a57e44bb8d6501b3e3a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/starcoder/main.cpp @@ -0,0 +1,927 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams (GPT-2 117M) +// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json +struct starcoder_hparams { + int32_t n_vocab = 49280; + int32_t n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct starcoder_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct starcoder_model { + starcoder_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// load the model's weights from a file +bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if 
(n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + // Add StarChat special tokens. + for (std::string token : { + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|end|>", + "", + "", + "", + "", + "<|end_of_turn|>" + }) { + if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { + vocab.add_special_token(token); + } + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() 
failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + 
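+ // memory_k and memory_v each hold n_layer*n_ctx*n_embd f32 values, so the
+ // default hparams (24 layers, 2048 ctx, 2048 embd) give a KV cache of
+ // 2*24*2048*2048*4 bytes = 768.00 MB, matching the sample output in the README.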
ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", + __func__, name.c_str(), (int) ggml_nelements(tensor), nelements); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool starcoder_eval( + const starcoder_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 
1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, 
GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + 
ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + starcoder_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + if (params.repeat_last_n == -1) { + params.repeat_last_n = model.hparams.n_ctx; + } + printf("\n"); + printf("%s: temp = %.3f\n", __func__, params.temp); + printf("%s: top_k = %d\n", __func__, params.top_k); + printf("%s: top_p = %.3f\n", __func__, params.top_p); + printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); + printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + std::vector last_n_tokens(model.hparams.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (size_t i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + } + printf("\n\n"); + + // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens. 
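+ // The lookup below prefers StarChat's "<|end|>" and falls back to
+ // "<|end_of_turn|>"; if neither token is in the vocab, starchat_end_token
+ // stays -1 and never matches a sampled token id.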
+ gpt_vocab::id starchat_end_token = -1; + { + const auto it = vocab.token_to_id.find("<|end|>"); + if (it != vocab.token_to_id.end()) { + starchat_end_token = it->second; + } else { + const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>"); + if (eot_token_id != vocab.token_to_id.end()) { + starchat_end_token = eot_token_id->second; + } + } + } + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng); + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[k]); + + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // check if model is santacoder + if (model.hparams.n_layer <= 30 && embd.back() == 49152) { + break; + } + // check if model is starcoder + else if (embd.back() == 0) { //TODO: this is only for starcoder + break; + } + // Handle StarChat "<|end|>" token. 
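+ // The i >= embd_inp.size() guard ensures the end token only stops
+ // generation after the prompt has been fully consumed.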
+ else if (embd.back() == starchat_end_token && i >= embd_inp.size()) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/quantize.cpp b/stable-diffusion.cpp/ggml/examples/starcoder/quantize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d3aee3f2679951e0c84e667bca7f6abe1b5dc5bf --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/starcoder/quantize.cpp @@ -0,0 +1,184 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// default hparams (GPT-2 117M) +struct starcoder_hparams { + int32_t n_vocab = 49280; + int32_t n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; +}; + +// quantize a model +bool starcoder_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { + gpt_vocab vocab; + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + starcoder_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fout.write((char *) &hparams.n_embd, 
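+ // The output header encodes ftype_dst = GGML_QNT_VERSION*GGML_QNT_VERSION_FACTOR + ftype,
+ // letting loaders recover the quantization version (division) and the
+ // quantization type (modulo), as starcoder_model_load does above.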
sizeof(hparams.n_embd)); + fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); + fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + int32_t n_vocab = 0; + finp.read ((char *) &n_vocab, sizeof(n_vocab)); + fout.write((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte", + "model/lm_head", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/examples/starcoder/starcoder-mmap.cpp b/stable-diffusion.cpp/ggml/examples/starcoder/starcoder-mmap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a224115a1480518643575bcdb5dcfd32340dee4a --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/starcoder/starcoder-mmap.cpp @@ -0,0 +1,1127 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(_WIN32) +// mmap +#include +#include +#include +#include +#else +#define NOMINMAX +#include +#endif + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_CLBLAST +#include "ggml-opencl.h" +#endif + +// default hparams (GPT-2 117M) +// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json +struct starcoder_hparams { + int32_t n_vocab = 49280; + int32_t 
n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct starcoder_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct llama_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + llama_buffer() = default; + + void resize(size_t len) { +#ifdef GGML_USE_METAL + free(addr); + int result = posix_memalign((void **) &addr, getpagesize(), len); + if (result == 0) { + memset(addr, 0, len); + } + else { + addr = NULL; + } +#else + delete[] addr; + addr = new uint8_t[len]; +#endif + size = len; + } + + ~llama_buffer() { +#ifdef GGML_USE_METAL + free(addr); +#else + delete[] addr; +#endif + addr = NULL; + } + + // disable copy and move + llama_buffer(const llama_buffer&) = delete; + llama_buffer(llama_buffer&&) = delete; + llama_buffer& operator=(const llama_buffer&) = delete; + llama_buffer& operator=(llama_buffer&&) = delete; +}; + + +struct kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx = NULL; + + //std::vector buf; + llama_buffer buf; + + int n; +}; + +struct starcoder_model { + starcoder_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + //struct ggml_tensor * memory_k; + //struct ggml_tensor * memory_v; + struct kv_cache cache; + + // model memory mapped file + void * mm_addr = NULL; + uint64_t mm_length = 0; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// From PR #613 (https://github.com/ggerganov/llama.cpp/pull/613) +static void *mmap_file(const char *fname, uint64_t *mm_length) { +#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) + HANDLE hFile = CreateFileA(fname, + GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, + NULL); + if (hFile == INVALID_HANDLE_VALUE) return 0; + LARGE_INTEGER fileSize; + fileSize.QuadPart = -1; + GetFileSizeEx(hFile, &fileSize); + int64_t length = fileSize.QuadPart; + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + CloseHandle(hFile); + if (!hMapping) return 0; + void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(hMapping); + if (!addr) return 0; +#else + int fd = open(fname, O_RDONLY); + if (fd == -1) return 0; + int64_t length = lseek(fd, 0, SEEK_END); + void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) return 0; +#endif + *mm_length = length; + return addr; +} + +static void munmap_file(void * addr, size_t length) { +#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) + UnmapViewOfFile(addr); +#else + munmap(addr, length); +#endif +} + +// load the model's weights from a file +bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab, int32_t n_gpu_layers) { + printf("%s: 
loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + std::vector f_buf(1024*1024); + fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); + + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + //if (magic != 0x67676a74) { + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + // Add StarChat special tokens. 
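+ // Tokens present in the vocab are registered as special so the tokenizer
+ // can match them as single units instead of splitting them through BPE.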
+ for (std::string token : { + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|end|>", + }) { + if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { + vocab.add_special_token(token); + } + } + } + + char *mm_addr = NULL; + model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); + if (model.mm_addr == NULL) { + fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); + return false; + } + mm_addr = (char *)model.mm_addr; + fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_layer = hparams.n_layer; + + + /* + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + */ + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = 
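+ // Since the context above was created with no_alloc=true, the tensors
+ // created below hold metadata only; their data pointers are set to point
+ // directly into the mmap'd model file during weight loading further down.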
hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.cache.buf.resize(2u*n_elements*ggml_type_size(GGML_TYPE_F16) + 2u*1024*1024); + + struct ggml_init_params c_params; + c_params.mem_size = model.cache.buf.size; + c_params.mem_buffer = model.cache.buf.addr; + c_params.no_alloc = false; + + model.cache.ctx = ggml_init(c_params); + + if (!model.cache.ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + model.cache.k = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, 
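+ // (Using F16 here halves the KV-cache memory relative to the F32 cache in
+ // the non-mmap main.cpp example.)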
n_elements);
+        model.cache.v = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, n_elements);
+
+        const size_t memory_size = ggml_nbytes(model.cache.k) + ggml_nbytes(model.cache.v);
+
+        printf("%s: kv_cache memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+    }
+
+    // load weights
+    {
+        size_t total_size = 0;
+
+        bool has_lm_head = false;
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ttype;
+
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+            if (fin.eof()) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
+
+            std::string name(length, 0);
+            fin.read(&name[0], length);
+
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                return false;
+            }
+
+            auto tensor = model.tensors[name.data()];
+
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
+                        __func__, name.data(), (int) ggml_nelements(tensor), nelements);
+                return false;
+            }
+
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }
+
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                return false;
+            }
+
+            // mmap
+            size_t offset = fin.tellg();
+            size_t tensor_data_size = ggml_nbytes(tensor);
+            //offset = (offset + 31) & -32;
+            tensor->data = mm_addr + offset;
+            fin.seekg(offset + tensor_data_size);
+            total_size += tensor_data_size;
+
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                // Don't know if this is required; test models have an lm_head
+                model.lm_head->data = tensor->data;
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }
+        }
+
+        printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+    }
+
+    fin.close();
+
+#ifdef GGML_USE_CUBLAS
+    {
+        const auto & hparams = model.hparams;
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+
+        ggml_cuda_set_scratch_size(0); // disable scratch
+
+        //if (n_gpu_layers > (int) hparams.n_layer) {
+        //    fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        //    ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        //}
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#elif defined(GGML_USE_CLBLAST)
+    // From koboldcpp
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif
+
+    return true;
+}
+
+// evaluate the transformer
+//
+//   - model:     the model
+//   - n_threads: number of threads to use
+//   - n_past:    the context size so far
+//   - embd_inp:  the embeddings of the tokens in the context
+//   - embd_w:    the predicted logits for the next token
+//
+bool starcoder_eval(
+        const starcoder_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt_vocab::id> & embd_inp,
+              std::vector<float>         & embd_w,
+        size_t                           & mem_per_token) {
+
+    const int N = int(embd_inp.size());
+
+    const auto & hparams = model.hparams;
+
+    auto & cache = model.cache;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+
+    // Scratch is too small for large n_batch (256)
+    //static size_t buf_size = 256u*1024*1024;
+    static size_t buf_size = 256u*1024*1024*2;
+    static void * buf = malloc(buf_size);
+
+    // use 2 scratch buffers
+    // TODO: very hacky solution - reimplement in a more elegant way
+    static size_t scratch0_size = 256u*1024*1024*2;
+    static void * scratch0 = malloc(scratch0_size);
+
+    static size_t scratch1_size = 256u*1024*1024*2;
+    static void * scratch1 = malloc(scratch1_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = size_t(1.1*(mem_per_token*N)); // add 10% to account for ggml object overhead
+        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        
/*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scratch0_size, scratch0, }); + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, cache.k, N*n_embd, (ggml_element_size(cache.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, cache.v, N*n_embd, (ggml_element_size(cache.v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, cache.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) 
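+                // (ggml_diag_mask_inf_inplace overwrites the scores of future
+                //  positions -- key index greater than n_past + query index --
+                //  with -INF, so the softmax below assigns them zero attention
+                //  weight; this is the standard causal mask.)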
+ // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, cache.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, cache.v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + ggml_set_scratch(ctx0, { 0, scratch1_size, scratch1, }); + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + ggml_set_scratch(ctx0, { 0, scratch0_size, scratch0, }); + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 
0) {
+    //    ggml_graph_print   (&gf);
+    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //}
+
+    //embd_w.resize(n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+
+    // return result just for the last token
+    embd_w.resize(n_vocab);
+    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+    //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
+
+    ggml_free(ctx0);
+
+    return true;
+}
+
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    gpt_params params;
+    params.model = "models/gpt-2-117M/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (params.seed < 0) {
+        params.seed = int(time(NULL));
+    }
+
+    printf("%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.prompt.empty()) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    int64_t t_load_us = 0;
+
+    gpt_vocab vocab;
+    starcoder_model model;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!starcoder_model_load(params.model, model, vocab, params.n_gpu_layers)) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        t_load_us = ggml_time_us() - t_start_us;
+
+        test_gpt_tokenizer(vocab, params.token_test);
+    }
+
+    int n_past = 0;
+
+    int64_t t_sample_us  = 0;
+    int64_t t_predict_us = 0;
+
+    std::vector<float> logits;
+
+    // tokenize the prompt
+    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
+
+    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    for (size_t i = 0; i < embd_inp.size(); i++) {
+        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+    }
+    printf("\n\n");
+
+    // Handle StarChat "<|end|>" token.
+    gpt_vocab::id starchat_end_token = -1;
+    {
+        const auto it = vocab.token_to_id.find("<|end|>");
+        if (it != vocab.token_to_id.end()) {
+            starchat_end_token = it->second;
+        }
+    }
+
+    // submit the input prompt token-by-token
+    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
+    std::vector<gpt_vocab::id> embd;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    printf("Calling starcoder_eval\n");
+    starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+        // predict
+        if (embd.size() > 0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+                printf("Failed to predict\n");
+                return 1;
+            }
+
+            // Should input processing count towards t_predict?
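+            // (only iterations past the prompt are timed here, so t_predict
+            //  reflects pure generation speed; prompt ingestion is excluded
+            //  from t_predict but still counts towards the total time.)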
+            if (i > embd_inp.size()) {
+                t_predict_us += ggml_time_us() - t_start_us;
+            }
+        }
+
+        n_past += int(embd.size());
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (int32_t(embd.size()) >= params.n_batch) {
+                    break;
+                }
+            }
+            i += int(embd.size()) - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str());
+        }
+        fflush(stdout);
+
+        // check if model is santacoder
+        if (model.hparams.n_layer <= 30 && embd.back() == 49152) {
+            break;
+        }
+        // check if model is starcoder
+        else if (embd.back() == 0) { //TODO: this is only for starcoder
+            break;
+        }
+        // Handle StarChat "<|end|>" token.
+        else if (embd.back() == starchat_end_token) {
+            //break;
+        }
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        // Shouldn't the input prompt be subtracted?
+        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(n_past - embd_inp.size()));
+        //printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+    if (model.mm_addr) {
+        munmap_file(model.mm_addr, model.mm_length);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/stb_image.h b/stable-diffusion.cpp/ggml/examples/stb_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e807a0a6e7cdbfbbf48dff5f5d3f3693c2bc851
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/stb_image.h
@@ -0,0 +1,7987 @@
+/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
+   no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+ And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. + +RECENT REVISION HISTORY: + + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. 
There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. 
Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
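+//
+// For example (the value below is just an illustration, not a recommendation):
+//
+//    #define STBI_MAX_DIMENSIONS (1 << 26)
+//    #define STB_IMAGE_IMPLEMENTATION
+//    #include "stb_image.h"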
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels
+
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
+};
+
+#include <stdlib.h>
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef STBIDEF
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   
stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || 
defined(STBI_ONLY_BMP)  \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp, pow
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) && __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL       thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL       __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL       _Thread_local
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL       __thread
+      #endif
+   #endif
+#endif
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#endif
+
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
+}
+#endif
+
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   int read_from_callbacks;
+   int buflen;
+   stbi_uc buffer_start[128];
+   int callback_already_read;
+
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// initialize a memory-decode context
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
+}
+
+// initialize a callback-based context
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   return (int) fread(data,1,size,(FILE*) user);
+}
+
+static void stbi__stdio_skip(void *user, int n)
+{
+   int ch;
+   fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. 
*/ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int 
stbi__pnm_is16(stbi__context *s); +#endif + +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char *stbi__g_failure_reason; + +STBIDEF const char *stbi_failure_reason(void) +{ + return stbi__g_failure_reason; +} + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char *str) +{ + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void *stbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. +static int stbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX/b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__addsizes_valid(a*b*c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void *stbi__malloc_mad2(int a, int b, int add) +{ + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; + return stbi__malloc(a*b + add); +} +#endif + +static void *stbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; + return stbi__malloc(a*b*c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) +{ + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; + return stbi__malloc(a*b*c*d + add); +} +#endif + +// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow. 
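+// (for example, stbi__addints_valid(INT_MAX, 1) returns 0, while
+//  stbi__addints_valid(-5, 3) returns 1, since mixed signs cannot overflow.)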
+static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? 
stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? 
req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return 
stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_mem(&s,buffer,len); + + result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data,x,y,comp,req_comp); + return hdr_data; + } + #endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + float *result; + FILE *f = stbi__fopen(filename, "rb"); + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_file(&s,f); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not helpers are defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, they always +// report false!
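+// (editorial sketch, not part of upstream stb_image; "input.hdr" is a +// hypothetical path: +// if (stbi_is_hdr("input.hdr")) { int w,h,n; float *rgb = stbi_loadf("input.hdr", &w, &h, &n, 0); /* ...use w*h*n floats... */ stbi_image_free(rgb); } +// per the note above, with STBI_NO_LINEAR defined the query always reports 0.)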
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
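+ // (editorial note: a negative count can only come from a corrupt file; parking + // the cursor at the end of the buffer makes every later read see end-of-data.)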
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). 
png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, 
so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 
'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. 
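+// (editorial note: each nonzero fast_ac entry packs value<<8 plus run<<4 plus +// (huffman length + magnitude bits) into a signed 16-bit slot; e.g. value -3, +// run 2, combined length 5 packs to -3*256 + 2*16 + 5 = -731.)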
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! 
+ return -1; + STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; +} + +// bias[n] = (-1<<n) + 1 +static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767}; + +// combined JPEG 'receive' and JPEG 'extend', since baseline +// always extends everything it receives. +stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) +{ + unsigned int k; + int sgn; + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing + + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k + (stbi__jbias[n] & (sgn - 1)); +} + +// get some unsigned bits +stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) +{ + unsigned int k; + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k; +} + +stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) +{ + unsigned int k; + if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); + if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s instead of continuing + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; +} + +// given a value that's at position X in the zigzag stream, +// where does it appear in the 8x8 matrix coded as row-major? +static const stbi_uc stbi__jpeg_dezigzag[64+15] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63 +}; + +// decode one 64-entry block-- +static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant) +{ + int diff,dc,k; + int t; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data,0,64*sizeof(data[0])); + + diff = t ? 
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + int diff,dc; + int t; + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { 
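+ // (editorial note: the unsigned cast maps negative x to huge values, e.g. + // (unsigned int)-1 == 0xFFFFFFFF, so a single compare catches both sides.)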
+ if (x < 0) return 0; + if (x > 255) return 255; + } + return (stbi_uc) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) * 4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + stbi_uc *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0]*4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". 
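+// (editorial note: the rotation constants below are the same 4096-scaled +// fixed-point values as the scalar path, e.g. stbi__f2f(0.5411961f) == 2217; +// they are packed pairwise to suit _mm_madd_epi16, which dots two 16-bit lanes +// into each 32-bit result.)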
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), 
stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. 
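+// (editorial note: where the SSE2 path pairs constants for _mm_madd_epi16, +// this one widens with vmull_s16/vmlal_s16 multiply-accumulates; the row-pass +// shift is split into 16 plus a rounding 1 because vrshrn_n_s32 cannot shift +// by 17 directly.)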
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = 
vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! + + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. 
if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg *j) +{ + stbi_uc x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interval? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu...
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu...
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! 
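+ // accounting note: each DHT table is 17 header bytes (1 class/id byte plus 16 code-length counts) followed by its n symbol bytes, which the "L -= 17" and "L -= n" below consume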
+ L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) 
return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
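+ // (byte-stuffing recap: within entropy-coded data a literal 0xff byte is always encoded as 0xff 0x00, so 0xff followed by anything other than 0x00 or another 0xff is a genuine marker)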
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 
2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
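+ // no explicit rounding bias is needed in this path: vqrshrun_n_s16(..., 4) below rounds while narrowing, so the results match the scalar stbi__div16(3*t0 + t1 + 8) computation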
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
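+ // fixed-point recap: the constants are the YCbCr coefficients scaled by 4096; cr/cb are unpacked into the high byte (value<<8), so _mm_mulhi_epi16 yields value*coef*16, and y is biased and shifted to the same 16x scale before the final >>4 descale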
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
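+ // same 4096-scaled constants as the SSE2 path; cr/cb are widened with <<7 and vqdmulhq_s16 doubles before taking the high half, again producing value*coef*16 to match the y channel's <<4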
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make 
stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + + // resample and color-convert + { + int k; + unsigned int i,j; + stbi_uc *output; + stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + stbi_uc *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? 
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + unsigned char* result; + stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x,y,comp,req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context *s) +{ + int r; + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) +{ + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind( j->s ); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) +{ + int result; + stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpeg packs from the left, zlib from the right, so they can't share code) +typedef struct +{ + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g.
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { + a->zout = zout; + return 1; + } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. 
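+ // dist==1 means "repeat the previous byte len times", i.e. pure RLE; the general path below must copy byte-by-byte since source and destination may overlap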
+ stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if (!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + 
ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int stbi__paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 
2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); + cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes+1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
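+ // (STBI__CASE(f) expands to "case f: for (k=0; k < nk; ++k)", so each case below runs the per-byte unfilter over the rest of the scanline)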
+               case STBI__F_none:         memcpy(cur, raw, nk); break;
+               STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+               STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+               STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+               STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+               STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+               STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
+            }
+            #undef STBI__CASE
+            raw += nk;
+         } else {
+            STBI_ASSERT(img_n+1 == out_n);
+            #define STBI__CASE(f) \
+                case f:     \
+                   for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                      for (k=0; k < filter_bytes; ++k)
+            switch (filter) {
+               STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+               STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+               STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+               STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+               STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+               STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+               STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+            }
+            #undef STBI__CASE
+
+            // the loop above sets the high byte of the pixels' alpha, but for
+            // 16 bit png files we also need the low byte set. we'll do that here.
+            if (depth == 16) {
+               cur = a->out + stride*j; // start at the beginning of the row again
+               for (i=0; i < x; ++i,cur+=output_bytes) {
+                  cur[filter_bytes+1] = 255;
+               }
+            }
+         }
+      }
+
+      // we make a separate pass to expand bits to pixels; for performance,
+      // this could run two scanlines behind the above code, so it won't
+      // interfere with filtering but will still be in the cache.
+      if (depth < 8) {
+         for (j=0; j < y; ++j) {
+            stbi_uc *cur = a->out + stride*j;
+            stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+            // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+            // pngs guarantee byte alignment; if width is not a multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+            stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+            // note that the final byte might overshoot and write more data than desired.
+            // we can allocate enough data that this never writes out of memory, but it
+            // could also overwrite the next scanline. can it overwrite non-empty data
+            // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc *cur = a->out; + stbi__uint16 *cur16 = (stbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535);
+         p += 2;
+      }
+   } else {
+      for (i = 0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0;
+         p += 4;
+      }
+   }
+   return 1;
+}
+
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *p, *temp_out, *orig = a->out;
+
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+   if (p == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // between here and free(out) below, exiting would leak
+   temp_out = p;
+
+   if (pal_img_n == 3) {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p += 3;
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p[3] = palette[n+3];
+         p += 4;
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = temp_out;
+
+   STBI_NOTUSED(len);
+
+   return 1;
+}
+
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
+
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
+}
+
+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set       \
+                                       ? stbi__unpremultiply_on_load_local  \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set \ ?
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
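+         // (worked example: an 8-bit paletted PNG is filtered and inflated as a
+         // single index channel, img_n == 1 below, and only expands to 3 or 4
+         // output components later, in stbi__expand_png_palette.)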
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 
*) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         p->s->img_out_n = req_comp;
+         if (result == NULL) return result;
+      }
+      *x = p->s->img_x;
+      *y = p->s->img_y;
+      if (n) *n = p->s->img_n;
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+
+   return result;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
+}
+
+static int stbi__png_test(stbi__context *s)
+{
+   int r;
+   r = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return r;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+      stbi__rewind( p->s );
+      return 0;
+   }
+   if (x) *x = p->s->img_x;
+   if (y) *y = p->s->img_y;
+   if (comp) *comp = p->s->img_n;
+   return 1;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__png_info_raw(&p, x, y, comp);
+}
+
+static int stbi__png_is16(stbi__context *s)
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
+      return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s);
+      return 0;
+   }
+   return 1;
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int r;
+   int sz;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   sz = stbi__get32le(s);
+   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
+   return r;
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int r = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return r;
+}
+
+
+// returns 0..31 for the highest set bit
+static int stbi__high_bit(unsigned int z)
+{
+   int n=0;
+   if (z == 0) return -1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n +=  8; z >>=  8; }
+   if (z >= 0x00010) { n +=  4; z >>=  4; }
+   if (z >= 0x00004) { n +=  2; z >>=  2; }
+   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
+   return n;
+}
+
+static int stbi__bitcount(unsigned int a)
+{
+   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
+   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
+   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
+   a = (a + (a >> 8)); // max 16 per 8 bits
+   a = (a + (a >> 16)); // max 32 per 8 bits
+   return a & 0xff;
+}
+
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to the full range.
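+// (worked example of the table-driven replication below: for bits=5,
+// mul_table[5]=0x21 and shift_table[5]=2, so a 5-bit value v becomes
+// (v*0x21)>>2 == ((v<<5)|v)>>2, i.e. the classic (v<<3)|(v>>2) bit
+// replication: 0b11111 -> 255, 0b10000 -> 132.)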
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
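+                  // three identical masks cannot describe three distinct
+                  // channels, so treat the file as corrupt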
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 
4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? 
(stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = 
stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
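+   // for reference, the reads above walk the fixed 18-byte TGA header in order:
+   //   offset 0  (1 byte)  id length         -> tga_offset
+   //   offset 1  (1 byte)  color map type    -> tga_indexed
+   //   offset 2  (1 byte)  image type        -> tga_image_type
+   //   offset 3  (2+2+1)   color map spec    -> tga_palette_start/len/bits
+   //   offset 8  (2+2)     x/y origin        -> tga_x_origin, tga_y_origin
+   //   offset 12 (2+2)     width/height      -> tga_width, tga_height
+   //   offset 16 (1 byte)  bits per pixel    -> tga_bits_per_pixel
+   //   offset 17 (1 byte)  image descriptor  -> tga_inverted (bit 5 = origin)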
+   // image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
+
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   // do a tiny bit of processing
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+   // If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   // tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else {
+      // do I need to load a palette?
+      if ( tga_indexed)
+      {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
+         // any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         // load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+            STBI_FREE(tga_data);
+            STBI_FREE(tga_palette);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      // load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         // if I'm in RLE mode, do I need to get a RLE packet?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               // yep, get the next byte as a RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         // OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            // load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ?
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. 
+ if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. 
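+      // (for example, a 2x2 RGB PSD stores the plane data as RRRR GGGG BBBB,
+      //  which the loop below interleaves into RGBA, synthesizing the
+      //  missing alpha plane as all-255.)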
+ for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int 
act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8) return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by
Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << 
g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
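+      // (Note, for later frames: the "dispose" handling below implements the
+      // GIF89a Graphic Control Extension disposal methods, taken from bits
+      // 2..4 of g->eflags -- 0/1 keep the previous frame's pixels in place,
+      // 2 restores the saved background, 3 reverts to the frame two back.)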
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
+            }
+         }
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0: not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
+   }
+
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
+   for (;;) {
+      int tag = stbi__get8(s);
+      switch (tag) {
+         case 0x2C: /* Image Descriptor */
+         {
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            // if the width of the specified rectangle is 0, that means
+            // we may not see *any* pixels or the image is malformed;
+            // to make sure this is caught, move the current y down to
+            // max_y (which is what out_gif_code checks).
+            if (w == 0)
+               g->cur_y = g->max_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            if (g->lflags & 0x80) {
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (!o) return NULL;
+
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; it will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
+            }
+
+            return o;
+         }
+
+         case 0x21: // Comment Extension.
+         {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
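+                  // (The eflags byte read above packs the Graphic Control
+                  // Extension flags: bit 0 is the "has transparent color"
+                  // flag tested below; bits 2..4 are the disposal method
+                  // applied when the next frame is drawn.)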
+ + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *) s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc *u = 0; + stbi_uc *out = 0; + stbi_uc *two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + + if (delays) { + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy( out + ((layers - 1) * stride), u, stride ); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. 
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
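+   // For reference, a typical Radiance header looks like this (the dimensions
+   // here are only an example):
+   //
+   //     #?RADIANCE
+   //     FORMAT=32-bit_rle_rgbe
+   //
+   //     -Y 768 +X 1024
+   //
+   // i.e. the final header line gives the height ("-Y") and then the width
+   // ("+X"), which is what the code below parses by hand.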
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scan lines
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (stbi__hdr_test(s) == 0) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, 
"FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context *s) +{ + char p, t; + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind( s ); + return 0; + } + return 1; +} + +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } + + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +} + +static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) +{ + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char) stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) + *c = (char) stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +static int stbi__pnm_getinteger(stbi__context *s, char *c) +{ + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value*10 + (*c - '0'); + *c = (char) stbi__get8(s); + if((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; +} + +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) +{ + int maxv, dummy; + char c, p, t; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char) stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if(*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) return 1; + #endif + + // test tga last because it's a crappy test! + #ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; + #endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context *s) +{ + #ifndef STBI_NO_PNG + if (stbi__png_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s,x,y,comp); + fseek(f,pos,SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE *f) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f,pos,SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const 
*buffer, int len)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__is_16_main(&s);
+}
+
+STBIWDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&s);
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT 
to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez Žemva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!) 
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/stable-diffusion.cpp/ggml/examples/stb_image_write.h b/stable-diffusion.cpp/ggml/examples/stb_image_write.h new file mode 100644 index 0000000000000000000000000000000000000000..e4b32ed1bc32ef9c962acbf47a9d10af01939e08 --- /dev/null +++ b/stable-diffusion.cpp/ggml/examples/stb_image_write.h @@ -0,0 +1,1724 @@ +/* stb_image_write - v1.16 - public domain - http://nothings.org/stb + writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015 + no warranty implied; use at your own risk + + Before #including, + + #define STB_IMAGE_WRITE_IMPLEMENTATION + + in the file that you want to have the implementation. + + Will probably not work correctly with strict-aliasing optimizations. + +ABOUT: + + This header file is a library for writing images to C stdio or a callback. + + The PNG output is not optimal; it is 20-50% larger than the file + written by a decent optimizing implementation; though providing a custom + zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that. + This library is designed for source code compactness and simplicity, + not optimal image file size or run-time performance. + +BUILDING: + + You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h. + You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace + malloc,realloc,free. + You can #define STBIW_MEMMOVE() to replace memmove() + You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function + for PNG compression (instead of the builtin one), it must have the following signature: + unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality); + The returned data will be freed with STBIW_FREE() (free() by default), + so it must be heap allocated with STBIW_MALLOC() (malloc() by default), + +UNICODE: + + If compiling for Windows and you wish to use Unicode filenames, compile + with + #define STBIW_WINDOWS_UTF8 + and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert + Windows wchar_t filenames to utf8. 
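+
+   As a minimal sketch of that workflow (the 1024-byte buffer and the wmain
+   entry point are illustrative choices, not part of the library):
+
+      #define STBIW_WINDOWS_UTF8
+      #define STB_IMAGE_WRITE_IMPLEMENTATION
+      #include "stb_image_write.h"
+
+      int wmain(int argc, wchar_t **argv)
+      {
+         char utf8_name[1024];
+         if (argc > 1 && stbiw_convert_wchar_to_utf8(utf8_name, sizeof(utf8_name), argv[1])) {
+            unsigned char pixel[3] = { 255, 0, 0 };       // a single red RGB pixel
+            stbi_write_png(utf8_name, 1, 1, 3, pixel, 3); // stride = 3 bytes per row
+         }
+         return 0;
+      }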
+ +USAGE: + + There are five functions, one for each image file format: + + int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); + int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); + int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); + int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality); + int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); + + void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically + + There are also five equivalent functions that use an arbitrary write function. You are + expected to open/close your file-equivalent before and after calling these: + + int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); + int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); + int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); + int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); + int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality); + + where the callback is: + void stbi_write_func(void *context, void *data, int size); + + You can configure it with these global variables: + int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE + int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression + int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode + + + You can define STBI_WRITE_NO_STDIO to disable the file variant of these + functions, so the library will not use stdio.h at all. However, this will + also disable HDR writing, because it requires stdio for formatted output. + + Each function returns 0 on failure and non-0 on success. + + The functions create an image file defined by the parameters. The image + is a rectangle of pixels stored from left-to-right, top-to-bottom. + Each pixel contains 'comp' channels of data stored interleaved with 8-bits + per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is + monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall. + The *data pointer points to the first byte of the top-left-most pixel. + For PNG, "stride_in_bytes" is the distance in bytes from the first byte of + a row of pixels to the first byte of the next row of pixels. + + PNG creates output files with the same number of components as the input. + The BMP format expands Y to RGB in the file format and does not + output alpha. + + PNG supports writing rectangles of data even when the bytes storing rows of + data are not consecutive in memory (e.g. sub-rectangles of a larger image), + by supplying the stride between the beginning of adjacent rows. The other + formats do not. (Thus you cannot write a native-format BMP through the BMP + writer, both because it is in BGR order and because it may have padding + at the end of the line.) + + PNG allows you to set the deflate compression level by setting the global + variable 'stbi_write_png_compression_level' (it defaults to 8). + + HDR expects linear float data. 
Since the format is always 32-bit rgb(e)
+   data, alpha (if provided) is discarded, and for monochrome data it is
+   replicated across all three channels.
+
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
+CREDITS:
+
+
+   Sean Barrett           -    PNG/BMP/TGA
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
+   bugfixes:
+      github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+      Andrew Kensler
+
+LICENSE
+
+   See end of file for license information.
+
+*/
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+
+#include <stdlib.h>
+
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF static
+#else
+#ifdef __cplusplus
+#define STBIWDEF extern "C"
+#else
+#define STBIWDEF extern
+#endif
+#endif
+#endif
+
+#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
+
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
+
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
+
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
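+
+// For illustration, a sketch of driving the *_to_func variants declared above
+// with a user-supplied callback; the name my_write_fn and the FILE* context
+// are example choices, not part of the library:
+//
+//    static void my_write_fn(void *context, void *data, int size)
+//    {
+//       fwrite(data, 1, size, (FILE *) context); // or append to a memory buffer
+//    }
+//
+//    // ... then, with an open FILE *f and an RGB buffer 'pixels':
+//    stbi_write_png_to_func(my_write_fn, f, w, h, 3, pixels, w * 3);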
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+
+#ifndef STBIW_MALLOC
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#endif
+
+
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+
+
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+static int stbi__flip_vertically_on_write = 0;
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag)
+{
+   stbi__flip_vertically_on_write = flag;
+}
+
+typedef struct
+{
+   stbi_write_func *func;
+   void *context;
+   unsigned char buffer[64];
+   int buf_used;
+} stbi__write_context;
+
+// initialize a callback-based context
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->func    = c;
+   s->context = context;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size)
+{
+   fwrite(data,1,size,(FILE*) context);
+}
+
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode)
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename)
+{
+   FILE *f = stbiw__fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
+   return f != NULL;
+}
+
+static void stbi__end_write_file(stbi__write_context *s)
+{
+   
fclose((FILE *)s->context); +} + +#endif // !STBI_WRITE_NO_STDIO + +typedef unsigned int stbiw_uint32; +typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1]; + +static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) +{ + while (*fmt) { + switch (*fmt++) { + case ' ': break; + case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); + s->func(s->context,&x,1); + break; } + case '2': { int x = va_arg(v,int); + unsigned char b[2]; + b[0] = STBIW_UCHAR(x); + b[1] = STBIW_UCHAR(x>>8); + s->func(s->context,b,2); + break; } + case '4': { stbiw_uint32 x = va_arg(v,int); + unsigned char b[4]; + b[0]=STBIW_UCHAR(x); + b[1]=STBIW_UCHAR(x>>8); + b[2]=STBIW_UCHAR(x>>16); + b[3]=STBIW_UCHAR(x>>24); + s->func(s->context,b,4); + break; } + default: + STBIW_ASSERT(0); + return; + } + } +} + +static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) +{ + va_list v; + va_start(v, fmt); + stbiw__writefv(s, fmt, v); + va_end(v); +} + +static void stbiw__write_flush(stbi__write_context *s) +{ + if (s->buf_used) { + s->func(s->context, &s->buffer, s->buf_used); + s->buf_used = 0; + } +} + +static void stbiw__putc(stbi__write_context *s, unsigned char c) +{ + s->func(s->context, &c, 1); +} + +static void stbiw__write1(stbi__write_context *s, unsigned char a) +{ + if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) + stbiw__write_flush(s); + s->buffer[s->buf_used++] = a; +} + +static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) +{ + int n; + if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) + stbiw__write_flush(s); + n = s->buf_used; + s->buf_used = n+3; + s->buffer[n+0] = a; + s->buffer[n+1] = b; + s->buffer[n+2] = c; +} + +static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) +{ + unsigned char bg[3] = { 255, 0, 255}, px[3]; + int k; + + if (write_alpha < 0) + stbiw__write1(s, d[comp - 1]); + + switch (comp) { + case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case + case 1: + if (expand_mono) + stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp + else + stbiw__write1(s, d[0]); // monochrome TGA + break; + case 4: + if (!write_alpha) { + // composite against pink background + for (k = 0; k < 3; ++k) + px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; + stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); + break; + } + /* FALLTHROUGH */ + case 3: + stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); + break; + } + if (write_alpha > 0) + stbiw__write1(s, d[comp - 1]); +} + +static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) +{ + stbiw_uint32 zero = 0; + int i,j, j_end; + + if (y <= 0) + return; + + if (stbi__flip_vertically_on_write) + vdir *= -1; + + if (vdir < 0) { + j_end = -1; j = y-1; + } else { + j_end = y; j = 0; + } + + for (; j != j_end; j += vdir) { + for (i=0; i < x; ++i) { + unsigned char *d = (unsigned char *) data + (j*x+i)*comp; + stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); + } + stbiw__write_flush(s); + s->func(s->context, &zero, scanline_pad); + } +} + +static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) 
+{ + if (y < 0 || x < 0) { + return 0; + } else { + va_list v; + va_start(v, fmt); + stbiw__writefv(s, fmt, v); + va_end(v); + stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono); + return 1; + } +} + +static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) +{ + if (comp != 4) { + // write RGB bitmap + int pad = (-x*3) & 3; + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, + "11 4 22 4" "4 44 22 444444", + 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header + 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + } else { + // RGBA bitmaps need a v4 header + // use BI_BITFIELDS mode with 32bpp and alpha mask + // (straight BI_RGB with alpha mask doesn't work in most readers) + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0, + "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444", + 'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header + 108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header + } +} + +STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_bmp_core(&s, x, y, comp, data); +} + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_bmp_core(&s, x, y, comp, data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif //!STBI_WRITE_NO_STDIO + +static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) +{ + int has_alpha = (comp == 2 || comp == 4); + int colorbytes = has_alpha ? comp-1 : comp; + int format = colorbytes < 2 ? 
3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 + + if (y < 0 || x < 0) + return 0; + + if (!stbi_write_tga_with_rle) { + return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0, + "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); + } else { + int i,j,k; + int jend, jdir; + + stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); + + if (stbi__flip_vertically_on_write) { + j = 0; + jend = y; + jdir = 1; + } else { + j = y-1; + jend = -1; + jdir = -1; + } + for (; j != jend; j += jdir) { + unsigned char *row = (unsigned char *) data + j * x * comp; + int len; + + for (i = 0; i < x; i += len) { + unsigned char *begin = row + i * comp; + int diff = 1; + len = 1; + + if (i < x - 1) { + ++len; + diff = memcmp(begin, row + (i + 1) * comp, comp); + if (diff) { + const unsigned char *prev = begin; + for (k = i + 2; k < x && len < 128; ++k) { + if (memcmp(prev, row + k * comp, comp)) { + prev += comp; + ++len; + } else { + --len; + break; + } + } + } else { + for (k = i + 2; k < x && len < 128; ++k) { + if (!memcmp(begin, row + k * comp, comp)) { + ++len; + } else { + break; + } + } + } + } + + if (diff) { + unsigned char header = STBIW_UCHAR(len - 1); + stbiw__write1(s, header); + for (k = 0; k < len; ++k) { + stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); + } + } else { + unsigned char header = STBIW_UCHAR(len - 129); + stbiw__write1(s, header); + stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); + } + } + } + stbiw__write_flush(s); + } + return 1; +} + +STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_tga_core(&s, x, y, comp, (void *) data); +} + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_tga_core(&s, x, y, comp, (void *) data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR writer +// by Baldur Karlsson + +#define stbiw__max(a, b) ((a) > (b) ? 
(a) : (b)) + +#ifndef STBI_WRITE_NO_STDIO + +static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) +{ + int exponent; + float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); + + if (maxcomp < 1e-32f) { + rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; + } else { + float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp; + + rgbe[0] = (unsigned char)(linear[0] * normalize); + rgbe[1] = (unsigned char)(linear[1] * normalize); + rgbe[2] = (unsigned char)(linear[2] * normalize); + rgbe[3] = (unsigned char)(exponent + 128); + } +} + +static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) +{ + unsigned char lengthbyte = STBIW_UCHAR(length+128); + STBIW_ASSERT(length+128 <= 255); + s->func(s->context, &lengthbyte, 1); + s->func(s->context, &databyte, 1); +} + +static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) +{ + unsigned char lengthbyte = STBIW_UCHAR(length); + STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code + s->func(s->context, &lengthbyte, 1); + s->func(s->context, data, length); +} + +static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline) +{ + unsigned char scanlineheader[4] = { 2, 2, 0, 0 }; + unsigned char rgbe[4]; + float linear[3]; + int x; + + scanlineheader[2] = (width&0xff00)>>8; + scanlineheader[3] = (width&0x00ff); + + /* skip RLE for images too small or large */ + if (width < 8 || width >= 32768) { + for (x=0; x < width; x++) { + switch (ncomp) { + case 4: /* fallthrough */ + case 3: linear[2] = scanline[x*ncomp + 2]; + linear[1] = scanline[x*ncomp + 1]; + linear[0] = scanline[x*ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; + break; + } + stbiw__linear_to_rgbe(rgbe, linear); + s->func(s->context, rgbe, 4); + } + } else { + int c,r; + /* encode into scratch buffer */ + for (x=0; x < width; x++) { + switch(ncomp) { + case 4: /* fallthrough */ + case 3: linear[2] = scanline[x*ncomp + 2]; + linear[1] = scanline[x*ncomp + 1]; + linear[0] = scanline[x*ncomp + 0]; + break; + default: + linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; + break; + } + stbiw__linear_to_rgbe(rgbe, linear); + scratch[x + width*0] = rgbe[0]; + scratch[x + width*1] = rgbe[1]; + scratch[x + width*2] = rgbe[2]; + scratch[x + width*3] = rgbe[3]; + } + + s->func(s->context, scanlineheader, 4); + + /* RLE each component separately */ + for (c=0; c < 4; c++) { + unsigned char *comp = &scratch[width*c]; + + x = 0; + while (x < width) { + // find first run + r = x; + while (r+2 < width) { + if (comp[r] == comp[r+1] && comp[r] == comp[r+2]) + break; + ++r; + } + if (r+2 >= width) + r = width; + // dump up to first run + while (x < r) { + int len = r-x; + if (len > 128) len = 128; + stbiw__write_dump_data(s, len, &comp[x]); + x += len; + } + // if there's a run, output it + if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd + // find next byte after run + while (r < width && comp[r] == comp[x]) + ++r; + // output run up to r + while (x < r) { + int len = r-x; + if (len > 127) len = 127; + stbiw__write_run_data(s, len, comp[x]); + x += len; + } + } + } + } + } +} + +static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) +{ + if (y <= 0 || x <= 0 || data == NULL) + return 0; + else { + // Each component is stored separately. 
Allocate scratch space for full output scanline. + unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4); + int i, len; + char buffer[128]; + char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; + s->func(s->context, header, sizeof(header)-1); + +#ifdef __STDC_LIB_EXT1__ + len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); +#else + len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); +#endif + s->func(s->context, buffer, len); + + for(i=0; i < y; i++) + stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i)); + STBIW_FREE(scratch); + return 1; + } +} + +STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_hdr_core(&s, x, y, comp, (float *) data); +} + +STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif // STBI_WRITE_NO_STDIO + + +////////////////////////////////////////////////////////////////////////////// +// +// PNG writer +// + +#ifndef STBIW_ZLIB_COMPRESS +// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() +#define stbiw__sbraw(a) ((int *) (void *) (a) - 2) +#define stbiw__sbm(a) stbiw__sbraw(a)[0] +#define stbiw__sbn(a) stbiw__sbraw(a)[1] + +#define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a)) +#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0) +#define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a))) + +#define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v)) +#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) +#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0) + +static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) +{ + int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1; + void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? 
(stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2); + STBIW_ASSERT(p); + if (p) { + if (!*arr) ((int *) p)[1] = 0; + *arr = (void *) ((int *) p + 2); + stbiw__sbm(*arr) = m; + } + return *arr; +} + +static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) +{ + while (*bitcount >= 8) { + stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); + *bitbuffer >>= 8; + *bitcount -= 8; + } + return data; +} + +static int stbiw__zlib_bitrev(int code, int codebits) +{ + int res=0; + while (codebits--) { + res = (res << 1) | (code & 1); + code >>= 1; + } + return res; +} + +static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit) +{ + int i; + for (i=0; i < limit && i < 258; ++i) + if (a[i] != b[i]) break; + return i; +} + +static unsigned int stbiw__zhash(unsigned char *data) +{ + stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; +} + +#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) +#define stbiw__zlib_add(code,codebits) \ + (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) +#define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c) +// default huffman tables +#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8) +#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) +#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7) +#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8) +#define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n)) +#define stbiw__zlib_huffb(n) ((n) <= 143 ? 
stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) + +#define stbiw__ZHASH 16384 + +#endif // STBIW_ZLIB_COMPRESS + +STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) +{ +#ifdef STBIW_ZLIB_COMPRESS + // user provided a zlib compress implementation, use that + return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality); +#else // use builtin + static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 }; + static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; + static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 }; + static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + unsigned int bitbuf=0; + int i,j, bitcount=0; + unsigned char *out = NULL; + unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**)); + if (hash_table == NULL) + return NULL; + if (quality < 5) quality = 5; + + stbiw__sbpush(out, 0x78); // DEFLATE 32K window + stbiw__sbpush(out, 0x5e); // FLEVEL = 1 + stbiw__zlib_add(1,1); // BFINAL = 1 + stbiw__zlib_add(1,2); // BTYPE = 1 -- fixed huffman + + for (i=0; i < stbiw__ZHASH; ++i) + hash_table[i] = NULL; + + i=0; + while (i < data_len-3) { + // hash next 3 bytes of data to be compressed + int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3; + unsigned char *bestloc = 0; + unsigned char **hlist = hash_table[h]; + int n = stbiw__sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32768) { // if entry lies within window + int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i); + if (d >= best) { best=d; bestloc=hlist[j]; } + } + } + // when hash table entry is too long, delete half the entries + if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) { + STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality); + stbiw__sbn(hash_table[h]) = quality; + } + stbiw__sbpush(hash_table[h],data+i); + + if (bestloc) { + // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal + h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1); + hlist = hash_table[h]; + n = stbiw__sbcount(hlist); + for (j=0; j < n; ++j) { + if (hlist[j]-data > i-32767) { + int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1); + if (e > best) { // if next match is better, bail on current match + bestloc = NULL; + break; + } + } + } + } + + if (bestloc) { + int d = (int) (data+i - bestloc); // distance back + STBIW_ASSERT(d <= 32767 && best <= 258); + for (j=0; best > lengthc[j+1]-1; ++j); + stbiw__zlib_huff(j+257); + if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]); + for (j=0; d > distc[j+1]-1; ++j); + stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5); + if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]); + i += best; + } else { + stbiw__zlib_huffb(data[i]); + ++i; + } + } + // write out final bytes + for (;i < data_len; ++i) + stbiw__zlib_huffb(data[i]); + stbiw__zlib_huff(256); // end of block + // pad with 0 bits to byte boundary + while (bitcount) + stbiw__zlib_add(0,1); + + for (i=0; i < stbiw__ZHASH; ++i) + (void) stbiw__sbfree(hash_table[i]); + STBIW_FREE(hash_table); + + // store uncompressed instead if compression was worse + if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) { + stbiw__sbn(out) = 2; // truncate to DEFLATE 
32K window and FLEVEL = 1 + for (j = 0; j < data_len;) { + int blocklen = data_len - j; + if (blocklen > 32767) blocklen = 32767; + stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression + stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN + stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN + stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8)); + memcpy(out+stbiw__sbn(out), data+j, blocklen); + stbiw__sbn(out) += blocklen; + j += blocklen; + } + } + + { + // compute adler32 on input + unsigned int s1=1, s2=0; + int blocklen = (int) (data_len % 5552); + j=0; + while (j < data_len) { + for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; } + s1 %= 65521; s2 %= 65521; + j += blocklen; + blocklen = 5552; + } + stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(s2)); + stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(s1)); + } + *out_len = stbiw__sbn(out); + // make returned pointer freeable + STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); + return (unsigned char *) stbiw__sbraw(out); +#endif // STBIW_ZLIB_COMPRESS +} + +static unsigned int stbiw__crc32(unsigned char *buffer, int len) +{ +#ifdef STBIW_CRC32 + return STBIW_CRC32(buffer, len); +#else + static unsigned int crc_table[256] = + { + 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, + 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, + 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, + 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, + 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, + 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, + 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, + 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, + 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, + 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, + 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, + 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, + 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, + 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, + 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, + 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, + 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, + 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, + 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, + 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, + 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, + 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, + 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 
0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, + 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, + 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, + 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, + 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, + 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, + 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, + 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, + 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, + 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D + }; + + unsigned int crc = ~0u; + int i; + for (i=0; i < len; ++i) + crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; + return ~crc; +#endif +} + +#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4) +#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v)); +#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3]) + +static void stbiw__wpcrc(unsigned char **data, int len) +{ + unsigned int crc = stbiw__crc32(*data - len - 4, len+4); + stbiw__wp32(*data, crc); +} + +static unsigned char stbiw__paeth(int a, int b, int c) +{ + int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c); + if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); + if (pb <= pc) return STBIW_UCHAR(b); + return STBIW_UCHAR(c); +} + +// @OPTIMIZE: provide an option that always forces left-predict or paeth predict +static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer) +{ + static int mapping[] = { 0,1,2,3,4 }; + static int firstmap[] = { 0,1,0,5,6 }; + int *mymap = (y != 0) ? mapping : firstmap; + int i; + int type = mymap[filter_type]; + unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); + int signed_stride = stbi__flip_vertically_on_write ? 
-stride_bytes : stride_bytes; + + if (type==0) { + memcpy(line_buffer, z, width*n); + return; + } + + // first loop isn't optimized since it's just one pixel + for (i = 0; i < n; ++i) { + switch (type) { + case 1: line_buffer[i] = z[i]; break; + case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break; + case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break; + case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break; + case 5: line_buffer[i] = z[i]; break; + case 6: line_buffer[i] = z[i]; break; + } + } + switch (type) { + case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break; + case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break; + case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break; + case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break; + case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break; + case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break; + } +} + +STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len) +{ + int force_filter = stbi_write_force_png_filter; + int ctype[5] = { -1, 0, 4, 2, 6 }; + unsigned char sig[8] = { 137,80,78,71,13,10,26,10 }; + unsigned char *out,*o, *filt, *zlib; + signed char *line_buffer; + int j,zlen; + + if (stride_bytes == 0) + stride_bytes = x * n; + + if (force_filter >= 5) { + force_filter = -1; + } + + filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0; + line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; } + for (j=0; j < y; ++j) { + int filter_type; + if (force_filter > -1) { + filter_type = force_filter; + stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer); + } else { // Estimate the best filter by running through all of them: + int best_filter = 0, best_filter_val = 0x7fffffff, est, i; + for (filter_type = 0; filter_type < 5; filter_type++) { + stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer); + + // Estimate the entropy of the line using this filter; the less, the better. 
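+            // "Entropy" here is just the sum of absolute values of the filtered
+            // bytes (a minimum-sum-of-absolute-differences heuristic, as suggested
+            // by the PNG spec for per-scanline filter selection); smaller residuals
+            // usually deflate better.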
+ est = 0; + for (i = 0; i < x*n; ++i) { + est += abs((signed char) line_buffer[i]); + } + if (est < best_filter_val) { + best_filter_val = est; + best_filter = filter_type; + } + } + if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it + stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer); + filter_type = best_filter; + } + } + // when we get here, filter_type contains the filter type, and line_buffer contains the data + filt[j*(x*n+1)] = (unsigned char) filter_type; + STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n); + } + STBIW_FREE(line_buffer); + zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level); + STBIW_FREE(filt); + if (!zlib) return 0; + + // each tag requires 12 bytes of overhead + out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12); + if (!out) return 0; + *out_len = 8 + 12+13 + 12+zlen + 12; + + o=out; + STBIW_MEMMOVE(o,sig,8); o+= 8; + stbiw__wp32(o, 13); // header length + stbiw__wptag(o, "IHDR"); + stbiw__wp32(o, x); + stbiw__wp32(o, y); + *o++ = 8; + *o++ = STBIW_UCHAR(ctype[n]); + *o++ = 0; + *o++ = 0; + *o++ = 0; + stbiw__wpcrc(&o,13); + + stbiw__wp32(o, zlen); + stbiw__wptag(o, "IDAT"); + STBIW_MEMMOVE(o, zlib, zlen); + o += zlen; + STBIW_FREE(zlib); + stbiw__wpcrc(&o, zlen); + + stbiw__wp32(o,0); + stbiw__wptag(o, "IEND"); + stbiw__wpcrc(&o,0); + + STBIW_ASSERT(o == out + *out_len); + + return out; +} + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes) +{ + FILE *f; + int len; + unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; + + f = stbiw__fopen(filename, "wb"); + if (!f) { STBIW_FREE(png); return 0; } + fwrite(png, 1, len, f); + fclose(f); + STBIW_FREE(png); + return 1; +} +#endif + +STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes) +{ + int len; + unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len); + if (png == NULL) return 0; + func(context, png, len); + STBIW_FREE(png); + return 1; +} + + +/* *************************************************************************** + * + * JPEG writer + * + * This is based on Jon Olick's jo_jpeg.cpp: + * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html + */ + +static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18, + 24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 }; + +static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) { + int bitBuf = *bitBufP, bitCnt = *bitCntP; + bitCnt += bs[1]; + bitBuf |= bs[0] << (24 - bitCnt); + while(bitCnt >= 8) { + unsigned char c = (bitBuf >> 16) & 255; + stbiw__putc(s, c); + if(c == 255) { + stbiw__putc(s, 0); + } + bitBuf <<= 8; + bitCnt -= 8; + } + *bitBufP = bitBuf; + *bitCntP = bitCnt; +} + +static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) { + float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p; + float z1, z2, z3, z4, z5, z11, z13; + + float tmp0 = d0 + d7; + float tmp7 = d0 - d7; + float tmp1 = d1 + d6; + 
float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;    // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2; // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6;
+}
+
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   int tmp1 = val < 0 ? -val : val;
+   val = val < 0 ? val-1 : val;
+   bits[1] = 1;
+   while(tmp1 >>= 1) {
+      ++bits[1];
+   }
+   bits[0] = val & ((1<<bits[1])-1);
+}
+
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
+   int dataOff, i, j, n, diff, end0pos, x, y;
+   int DU[64];
+
+   // DCT rows
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4], &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;
+         v = CDU[i]*fdtbl[j];
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+      }
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) {
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] =
{0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77}; + static const unsigned char std_ac_chrominance_values[] = { + 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91, + 0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26, + 0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58, + 0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87, + 0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4, + 0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda, + 0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa + }; + // Huffman tables + static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}}; + static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}}; + static const unsigned short YAC_HT[256][2] = { + {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0}, + 
{2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} + }; + static const unsigned short UVAC_HT[256][2] = { + {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, + {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0}, + {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} + }; + static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22, + 37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99}; + static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99, + 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99}; + static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, + 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; + + int row, col, i, k, subsample; + float fdtbl_Y[64], fdtbl_UV[64]; + unsigned char YTable[64], UVTable[64]; + + if(!data || !width || !height || comp > 4 || comp < 1) { + return 0; + } + + quality = quality ? quality : 90; + subsample = quality <= 90 ? 1 : 0; + quality = quality < 1 ? 1 : quality > 100 ? 
100 : quality; + quality = quality < 50 ? 5000 / quality : 200 - quality * 2; + + for(i = 0; i < 64; ++i) { + int uvti, yti = (YQT[i]*quality+50)/100; + YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti); + uvti = (UVQT[i]*quality+50)/100; + UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + } + + for(row = 0, k = 0; row < 8; ++row) { + for(col = 0; col < 8; ++col, ++k) { + fdtbl_Y[k] = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); + fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); + } + } + + // Write Headers + { + static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 }; + static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 }; + const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width), + 3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; + s->func(s->context, (void*)head0, sizeof(head0)); + s->func(s->context, (void*)YTable, sizeof(YTable)); + stbiw__putc(s, 1); + s->func(s->context, UVTable, sizeof(UVTable)); + s->func(s->context, (void*)head1, sizeof(head1)); + s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1); + s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values)); + stbiw__putc(s, 0x10); // HTYACinfo + s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1); + s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values)); + stbiw__putc(s, 1); // HTUDCinfo + s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1); + s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); + stbiw__putc(s, 0x11); // HTUACinfo + s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1); + s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); + s->func(s->context, (void*)head2, sizeof(head2)); + } + + // Encode 8x8 macroblocks + { + static const unsigned short fillBits[] = {0x7F, 7}; + int DCY=0, DCU=0, DCV=0; + int bitBuf=0, bitCnt=0; + // comp == 2 is grey+alpha (alpha is ignored) + int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; + const unsigned char *dataR = (const unsigned char *)data; + const unsigned char *dataG = dataR + ofsG; + const unsigned char *dataB = dataR + ofsB; + int x, y, pos; + if(subsample) { + for(y = 0; y < height; y += 16) { + for(x = 0; x < width; x += 16) { + float Y[256], U[256], V[256]; + for(row = y, pos = 0; row < y+16; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+16; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? 
col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + + // subsample U,V + { + float subU[64], subV[64]; + int yy, xx; + for(yy = 0, pos = 0; yy < 8; ++yy) { + for(xx = 0; xx < 8; ++xx, ++pos) { + int j = yy*32+xx*2; + subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f; + subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f; + } + } + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + } + } + } + } else { + for(y = 0; y < height; y += 8) { + for(x = 0; x < width; x += 8) { + float Y[64], U[64], V[64]; + for(row = y, pos = 0; row < y+8; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+8; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } + + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + } + } + } + + // Do the bit alignment of the EOI marker + stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits); + } + + // EOI + stbiw__putc(s, 0xFF); + stbiw__putc(s, 0xD9); + + return 1; +} + +STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality) +{ + stbi__write_context s = { 0 }; + stbi__start_write_callbacks(&s, func, context); + return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality); +} + + +#ifndef STBI_WRITE_NO_STDIO +STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality) +{ + stbi__write_context s = { 0 }; + if (stbi__start_write_file(&s,filename)) { + int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); + stbi__end_write_file(&s); + return r; + } else + return 0; +} +#endif + +#endif // STB_IMAGE_WRITE_IMPLEMENTATION + +/* Revision history + 1.16 (2021-07-11) + make Deflate code emit uncompressed blocks when it would otherwise expand + support writing BMPs with alpha channel + 1.15 (2020-07-13) unknown + 1.14 (2020-02-02) updated JPEG writer to downsample chroma channels + 1.13 + 1.12 + 1.11 (2019-08-11) + + 1.10 (2019-02-07) + support utf8 filenames in Windows; fix warnings and platform ifdefs + 1.09 (2018-02-11) + fix typo in zlib quality API, improve STB_I_W_STATIC in C++ + 1.08 (2018-01-29) + add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter + 1.07 (2017-07-24) + 
doc fix + 1.06 (2017-07-23) + writing JPEG (using Jon Olick's code) + 1.05 ??? + 1.04 (2017-03-03) + monochrome BMP expansion + 1.03 ??? + 1.02 (2016-04-02) + avoid allocating large structures on the stack + 1.01 (2016-01-16) + STBIW_REALLOC_SIZED: support allocators with no realloc support + avoid race-condition in crc initialization + minor compile issues + 1.00 (2015-09-14) + installable file IO function + 0.99 (2015-09-13) + warning fixes; TGA rle support + 0.98 (2015-04-08) + added STBIW_MALLOC, STBIW_ASSERT etc + 0.97 (2015-01-18) + fixed HDR asserts, rewrote HDR rle logic + 0.96 (2015-01-17) + add HDR output + fix monochrome BMP + 0.95 (2014-08-17) + add monochrome TGA output + 0.94 (2014-05-31) + rename private functions to avoid conflicts with stb_image.h + 0.93 (2014-05-27) + warning fixes + 0.92 (2010-08-01) + casts to unsigned char to fix warnings + 0.91 (2010-07-17) + first public release + 0.90 first internal release +*/ + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/CMakeLists.txt b/stable-diffusion.cpp/ggml/examples/whisper/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a58251d781addf41a8dc887f0259b0f187bc3652
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# whisper
+
+add_library(whisper-cpp STATIC
+    whisper.cpp
+    )
+
+target_link_libraries(whisper-cpp PRIVATE
+    ggml
+    )
+
+set(TEST_TARGET whisper)
+add_executable(${TEST_TARGET} main.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE whisper-cpp common)
+target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+#
+# whisper-quantize
+
+set(TEST_TARGET whisper-quantize)
+add_executable(${TEST_TARGET} quantize.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/README.md b/stable-diffusion.cpp/ggml/examples/whisper/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a2e972729636c32e9190e089609dc1c7ec30dc73
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/README.md
@@ -0,0 +1,29 @@
+# whisper
+
+Port of [OpenAI's Whisper](https://github.com/openai/whisper) ASR model in C/C++ using
+[ggml](https://github.com/ggerganov/ggml)
+
+## More info
+
+Check out https://github.com/ggerganov/whisper.cpp
+
+## Memory usage
+
+| Model  | Disk   | Mem     |
+| ---    | ---    | ---     |
+| tiny   |  75 MB | ~280 MB |
+| base   | 142 MB | ~430 MB |
+| small  | 466 MB | ~1.0 GB |
+| medium | 1.5 GB | ~2.6 GB |
+| large  | 2.9 GB | ~4.7 GB |
+
+## ggml format
+
+The original models are converted to a custom binary format. This makes it possible to pack everything needed into a single file:
+
+- model parameters
+- mel filters
+- vocabulary
+- weights
+
+For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py)
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/convert-pt-to-ggml.py b/stable-diffusion.cpp/ggml/examples/whisper/convert-pt-to-ggml.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aa134b53f7d05c1f9d2be60759f28b14e87fdc6
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/convert-pt-to-ggml.py
@@ -0,0 +1,342 @@
+# Convert Whisper transformer model from PyTorch to ggml format
+#
+# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#
+# You need to clone the original repo in ~/path/to/repo/whisper/
+#
+#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+#
+# It is used to obtain various assets needed by the algorithm:
+#
+#  - tokenizer
+#  - mel filters
+#
+# Also, you need to have the original models in ~/.cache/whisper/
+# See the original repo for more details.
+#
+# This script loads the specified model and whisper assets and saves them in ggml format.
+# The output is a single binary file containing the following information: +# +# - hparams +# - mel filters +# - tokenizer vocab +# - model variables +# +# For each variable, write the following: +# +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# + +import io +import os +import sys +import struct +import json +import code +import torch +import numpy as np +import base64 +from pathlib import Path +#from transformers import GPTJForCausalLM +#from transformers import GPT2TokenizerFast + +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 +#LANGUAGES = { +# "en": "english", +# "zh": "chinese", +# "de": "german", +# "es": "spanish", +# "ru": "russian", +# "ko": "korean", +# "fr": "french", +# "ja": "japanese", +# "pt": "portuguese", +# "tr": "turkish", +# "pl": "polish", +# "ca": "catalan", +# "nl": "dutch", +# "ar": "arabic", +# "sv": "swedish", +# "it": "italian", +# "id": "indonesian", +# "hi": "hindi", +# "fi": "finnish", +# "vi": "vietnamese", +# "iw": "hebrew", +# "uk": "ukrainian", +# "el": "greek", +# "ms": "malay", +# "cs": "czech", +# "ro": "romanian", +# "da": "danish", +# "hu": "hungarian", +# "ta": "tamil", +# "no": "norwegian", +# "th": "thai", +# "ur": "urdu", +# "hr": "croatian", +# "bg": "bulgarian", +# "lt": "lithuanian", +# "la": "latin", +# "mi": "maori", +# "ml": "malayalam", +# "cy": "welsh", +# "sk": "slovak", +# "te": "telugu", +# "fa": "persian", +# "lv": "latvian", +# "bn": "bengali", +# "sr": "serbian", +# "az": "azerbaijani", +# "sl": "slovenian", +# "kn": "kannada", +# "et": "estonian", +# "mk": "macedonian", +# "br": "breton", +# "eu": "basque", +# "is": "icelandic", +# "hy": "armenian", +# "ne": "nepali", +# "mn": "mongolian", +# "bs": "bosnian", +# "kk": "kazakh", +# "sq": "albanian", +# "sw": "swahili", +# "gl": "galician", +# "mr": "marathi", +# "pa": "punjabi", +# "si": "sinhala", +# "km": "khmer", +# "sn": "shona", +# "yo": "yoruba", +# "so": "somali", +# "af": "afrikaans", +# "oc": "occitan", +# "ka": "georgian", +# "be": "belarusian", +# "tg": "tajik", +# "sd": "sindhi", +# "gu": "gujarati", +# "am": "amharic", +# "yi": "yiddish", +# "lo": "lao", +# "uz": "uzbek", +# "fo": "faroese", +# "ht": "haitian creole", +# "ps": "pashto", +# "tk": "turkmen", +# "nn": "nynorsk", +# "mt": "maltese", +# "sa": "sanskrit", +# "lb": "luxembourgish", +# "my": "myanmar", +# "bo": "tibetan", +# "tl": "tagalog", +# "mg": "malagasy", +# "as": "assamese", +# "tt": "tatar", +# "haw": "hawaiian", +# "ln": "lingala", +# "ha": "hausa", +# "ba": "bashkir", +# "jw": "javanese", +# "su": "sundanese", +#} + +## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292 +#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"): +# os.environ["TOKENIZERS_PARALLELISM"] = "false" +# path = os.path.join(path_to_whisper_repo, "whisper/assets", name) +# tokenizer = GPT2TokenizerFast.from_pretrained(path) +# +# specials = [ +# "<|startoftranscript|>", +# *[f"<|{lang}|>" for lang in LANGUAGES.keys()], +# "<|translate|>", +# "<|transcribe|>", +# "<|startoflm|>", +# "<|startofprev|>", +# "<|nocaptions|>", +# "<|notimestamps|>", +# ] +# +# tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) +# return tokenizer + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a 
corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +if len(sys.argv) < 4: + print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n") + sys.exit(1) + +fname_inp = Path(sys.argv[1]) +dir_whisper = Path(sys.argv[2]) +dir_out = Path(sys.argv[3]) + +# try to load PyTorch binary data +try: + model_bytes = open(fname_inp, "rb").read() + with io.BytesIO(model_bytes) as fp: + checkpoint = torch.load(fp, map_location="cpu") +except Exception: + print("Error: failed to load PyTorch model file:" , fname_inp) + sys.exit(1) + +hparams = checkpoint["dims"] +print("hparams:", hparams) + +list_vars = checkpoint["model_state_dict"] + +#print(list_vars['encoder.positional_embedding']) +#print(list_vars['encoder.conv1.weight']) +#print(list_vars['encoder.conv1.weight'].shape) + +# load mel filters +n_mels = hparams["n_mels"] +with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f: + filters = torch.from_numpy(f[f"mel_{n_mels}"]) + #print (filters) + +#code.interact(local=locals()) + +# load tokenizer +# for backwards compatibility, also check for older hf_transformers format tokenizer files +# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json +# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken +multilingual = hparams["n_vocab"] == 51865 +tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken") +tokenizer_type = "tiktoken" +if not tokenizer.is_file(): + tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual" or "gpt2") / "vocab.json" + tokenizer_type = "hf_transformers" + if not tokenizer.is_file(): + print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer) + sys.exit(1) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +if tokenizer_type == "tiktoken": + with open(tokenizer, "rb") as f: + contents = f.read() + tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)} +elif tokenizer_type == "hf_transformers": + with open(tokenizer, "r", encoding="utf8") as f: + _tokens_raw = json.load(f) + if '<|endoftext|>' in _tokens_raw: + # ensures exact same model as tokenizer_type == tiktoken + # details: https://github.com/ggerganov/whisper.cpp/pull/725 + del _tokens_raw['<|endoftext|>'] + tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()} + +# output in the same directory as the model +fname_out = dir_out / "ggml-model.bin" + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 4: + use_f16 = False + fname_out = dir_out / "ggml-model-f32.bin" + +fout = fname_out.open("wb") + +fout.write(struct.pack("i", 0x67676d6c)) 
# magic: ggml in hex
+fout.write(struct.pack("i", hparams["n_vocab"]))
+fout.write(struct.pack("i", hparams["n_audio_ctx"]))
+fout.write(struct.pack("i", hparams["n_audio_state"]))
+fout.write(struct.pack("i", hparams["n_audio_head"]))
+fout.write(struct.pack("i", hparams["n_audio_layer"]))
+fout.write(struct.pack("i", hparams["n_text_ctx"]))
+fout.write(struct.pack("i", hparams["n_text_state"]))
+fout.write(struct.pack("i", hparams["n_text_head"]))
+fout.write(struct.pack("i", hparams["n_text_layer"]))
+fout.write(struct.pack("i", hparams["n_mels"]))
+fout.write(struct.pack("i", use_f16))
+
+# write mel filters
+fout.write(struct.pack("i", filters.shape[0]))
+fout.write(struct.pack("i", filters.shape[1]))
+for i in range(filters.shape[0]):
+    for j in range(filters.shape[1]):
+        fout.write(struct.pack("f", filters[i][j]))
+
+# write tokenizer
+fout.write(struct.pack("i", len(tokens)))
+
+for key in tokens:
+    fout.write(struct.pack("i", len(key)))
+    fout.write(key)
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+    print("Processing variable: " , name ,  " with shape: ", data.shape)
+
+    # reshape conv bias from [n] to [n, 1]
+    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
+        data = data.reshape(data.shape[0], 1)
+        print(f"  Reshaped variable: {name} to shape: ", data.shape)
+
+    n_dims = len(data.shape)
+
+    # looks like the whisper models are in f16 by default
+    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype = 1
+    if use_f16:
+        if n_dims < 2 or \
+                name == "encoder.conv1.bias" or \
+                name == "encoder.conv2.bias" or \
+                name == "encoder.positional_embedding" or \
+                name == "decoder.positional_embedding":
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype = 0
+    else:
+        data = data.astype(np.float32)
+        ftype = 0
+
+    #if name.startswith("encoder"):
+    #    if name.endswith("mlp.0.weight") or \
+    #       name.endswith("mlp.2.weight"):
+    #        print("  Transposing")
+    #        data = data.transpose()
+
+    # header
+    str_ = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+    fout.write(str_)
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " , fname_out)
+print("")
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/main.cpp b/stable-diffusion.cpp/ggml/examples/whisper/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60c1cca756a683de1321e72a38838957e464b629
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/main.cpp
@@ -0,0 +1,1024 @@
+#include "common.h"
+
+#include "whisper.h"
+
+#include <cmath>
+#include <fstream>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+#include <cstring>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
+// Lowest is red, middle is yellow, highest is green.
+const std::vector<std::string> k_colors = {
+    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
+    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+};
+
+//  500 -> 00:00:05.000
+// 6000 -> 00:01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+    return std::string(buf);
+}
+
+int timestamp_to_sample(int64_t t, int n_samples) {
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
+}
+
+// helper function to replace substrings
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    for (size_t pos = 0; ; pos += replace.length()) {
+        pos = s.find(search, pos);
+        if (pos == std::string::npos) break;
+        s.erase(pos, search.length());
+        s.insert(pos, replace);
+    }
+}
+
+// command-line parameters
+struct whisper_params {
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors  = 1;
+    int32_t offset_t_ms   = 0;
+    int32_t offset_n      = 0;
+    int32_t duration_ms   = 0;
+    int32_t progress_step = 5;
+    int32_t max_context   = -1;
+    int32_t max_len       = 0;
+    int32_t best_of       = 2;
+    int32_t beam_size     = -1;
+
+    float word_thold    =  0.01f;
+    float entropy_thold =  2.40f;
+    float logprob_thold = -1.00f;
+
+    bool speed_up        = false;
+    bool debug_mode      = false;
+    bool translate       = false;
+    bool detect_language = false;
+    bool diarize         = false;
+    bool tinydiarize     = false;
+    bool split_on_word   = false;
+    bool no_fallback     = false;
+    bool output_txt      = false;
+    bool output_vtt      = false;
+    bool output_srt      = false;
+    bool output_wts      = false;
+    bool output_csv      = false;
+    bool output_jsn      = false;
+    bool output_lrc      = false;
+    bool print_special   = false;
+    bool print_colors    = false;
+    bool print_progress  = false;
+    bool no_timestamps   = false;
+    bool log_score       = false;
+
+    std::string language  = "en";
+    std::string prompt;
+    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+    std::string model     = "models/ggml-base.en.bin";
+
+    // [TDRZ] speaker turn string
+    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
+
+    std::string openvino_encode_device = "CPU";
+
+    std::vector<std::string> fname_inp = {};
+    std::vector<std::string> fname_out = {};
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-") {
+            params.fname_inp.push_back(arg);
+            continue;
+        }
+
+        if (arg[0] != '-') {
+            params.fname_inp.push_back(arg);
+            continue;
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+        else if (arg == "-t"  || arg == "--threads")    { params.n_threads    = std::stoi(argv[++i]); }
+        else if (arg == "-p"  || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
+        else if (arg == "-ot" || arg == "--offset-t")   { params.offset_t_ms  = std::stoi(argv[++i]); }
+        else if (arg == "-on" || arg == "--offset-n")   { params.offset_n     = std::stoi(argv[++i]); }
+        else if (arg == "-d"  || arg == "--duration")   {
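+            // editor's note: value flags read their argument via argv[++i] with no
+            // bounds check, so a flag given last with no value will index past argc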
params.duration_ms = std::stoi(argv[++i]); } + else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); } + else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); } + else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); } + else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); } + else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } + else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } + else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } + // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } + else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } + else if (arg == "-tr" || arg == "--translate") { params.translate = true; } + else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } + else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } + else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; } + else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } + else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; } + else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; } + else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } + else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } + else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; } + else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; } + else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } + else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; } + else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); } + else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } + else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; } + else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } + else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } + else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } + else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; } + else if ( arg == "--prompt") { params.prompt = argv[++i]; } + else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } + else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } + else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } + else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + whisper_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) { + fprintf(stderr, "\n"); + fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help [default] show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); + 
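+    // editor's note: the [%-7d]/[%-7s] conversions left-justify each current value
+    // in a 7-character column so the defaults line up in the help text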
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors); + fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms); + fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n); + fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms); + fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context); + fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len); + fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false"); + fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of); + fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size); + fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); + fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); + fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); + // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); + fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); + fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); + fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); + fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); + fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); + fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false"); + fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false"); + fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false"); + fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false"); + fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false"); + fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str()); + fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false"); + fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false"); + fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", ""); + fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); + fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false"); + fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? 
"true" : "false"); + fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false"); + fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str()); + fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false"); + fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str()); + fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); + fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); + fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); + fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false"); + fprintf(stderr, "\n"); +} + +struct whisper_print_user_data { + const whisper_params * params; + + const std::vector> * pcmf32s; + int progress_prev; +}; + +std::string estimate_diarization_speaker(std::vector> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) { + std::string speaker = ""; + const int64_t n_samples = pcmf32s[0].size(); + + const int64_t is0 = timestamp_to_sample(t0, n_samples); + const int64_t is1 = timestamp_to_sample(t1, n_samples); + + double energy0 = 0.0f; + double energy1 = 0.0f; + + for (int64_t j = is0; j < is1; j++) { + energy0 += fabs(pcmf32s[0][j]); + energy1 += fabs(pcmf32s[1][j]); + } + + if (energy0 > 1.1*energy1) { + speaker = "0"; + } else if (energy1 > 1.1*energy0) { + speaker = "1"; + } else { + speaker = "?"; + } + + //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str()); + + if (!id_only) { + speaker.insert(0, "(speaker "); + speaker.append(")"); + } + + return speaker; +} +void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) { + int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step; + int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev); + if (progress >= *progress_prev + progress_step) { + *progress_prev += progress_step; + fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress); + } +} + +void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) { + const auto & params = *((whisper_print_user_data *) user_data)->params; + const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s; + + const int n_segments = whisper_full_n_segments(ctx); + + std::string speaker = ""; + + int64_t t0 = 0; + int64_t t1 = 0; + + // print the last n_new segments + const int s0 = n_segments - n_new; + + if (s0 == 0) { + printf("\n"); + } + + for (int i = s0; i < n_segments; i++) { + if (!params.no_timestamps || params.diarize) { + t0 = whisper_full_get_segment_t0(ctx, i); + t1 = whisper_full_get_segment_t1(ctx, i); + } + + if (!params.no_timestamps) { + printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str()); + } + + if (params.diarize && pcmf32s.size() == 2) { + speaker = estimate_diarization_speaker(pcmf32s, t0, t1); + } + + if (params.print_colors) { + for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) { + if (params.print_special == false) { + const whisper_token id = whisper_full_get_token_id(ctx, i, j); + if (id >= whisper_token_eot(ctx)) { + 
continue; + } + } + + const char * text = whisper_full_get_token_text(ctx, i, j); + const float p = whisper_full_get_token_p (ctx, i, j); + + const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size())))); + + printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m"); + } + } else { + const char * text = whisper_full_get_segment_text(ctx, i); + + printf("%s%s", speaker.c_str(), text); + } + + if (params.tinydiarize) { + if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { + printf("%s", params.tdrz_speaker_turn.c_str()); + } + } + + // with timestamps or speakers: each segment on new line + if (!params.no_timestamps || params.diarize) { + printf("\n"); + } + + fflush(stdout); + } +} + +bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) + { + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + speaker = estimate_diarization_speaker(pcmf32s, t0, t1); + } + + fout << speaker << text << "\n"; + } + + return true; +} + +bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + fout << "WEBVTT\n\n"; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) + { + speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); + speaker.insert(0, ""); + } + + fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; + fout << speaker << text << "\n\n"; + } + + return true; +} + +bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) + { + speaker = estimate_diarization_speaker(pcmf32s, t0, t1); + } + + fout << i + 1 + params.offset_n << "\n"; + fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n"; + fout << speaker << text << "\n\n"; + } + + return true; +} + +char *escape_double_quotes_and_backslashes(const char *str) { + if (str 
== NULL) { + return NULL; + } + + size_t escaped_length = strlen(str) + 1; + + for (size_t i = 0; str[i] != '\0'; i++) { + if (str[i] == '"' || str[i] == '\\') { + escaped_length++; + } + } + + char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed + if (escaped == NULL) { + return NULL; + } + + size_t pos = 0; + for (size_t i = 0; str[i] != '\0'; i++) { + if (str[i] == '"' || str[i] == '\\') { + escaped[pos++] = '\\'; + } + escaped[pos++] = str[i]; + } + + // no need to set zero due to calloc() being used prior + + return escaped; +} + +bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + const int n_segments = whisper_full_n_segments(ctx); + fout << "start,end,"; + if (params.diarize && pcmf32s.size() == 2) + { + fout << "speaker,"; + } + fout << "text\n"; + + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + char * text_escaped = escape_double_quotes_and_backslashes(text); + + //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. + fout << 10 * t0 << "," << 10 * t1 << ","; + if (params.diarize && pcmf32s.size() == 2) + { + fout << estimate_diarization_speaker(pcmf32s, t0, t1, true) << ","; + } + fout << "\"" << text_escaped << "\"\n"; + } + + return true; +} + +bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector> /*pcmf32s*/) { + std::ofstream fout(fname); + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + const int n_segments = whisper_full_n_segments(ctx); + // fprintf(stderr,"segments: %d\n",n_segments); + for (int i = 0; i < n_segments; ++i) { + const int n_tokens = whisper_full_n_tokens(ctx, i); + // fprintf(stderr,"tokens: %d\n",n_tokens); + for (int j = 0; j < n_tokens; j++) { + auto token = whisper_full_get_token_text(ctx, i, j); + auto probability = whisper_full_get_token_p(ctx, i, j); + fout << token << '\t' << probability << std::endl; + // fprintf(stderr,"token: %s %f\n",token,probability); + } + } + return true; +} + +bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + int indent = 0; + + auto doindent = [&]() { + for (int i = 0; i < indent; i++) fout << "\t"; + }; + + auto start_arr = [&](const char *name) { + doindent(); + fout << "\"" << name << "\": [\n"; + indent++; + }; + + auto end_arr = [&](bool end) { + indent--; + doindent(); + fout << (end ? "]\n" : "},\n"); + }; + + auto start_obj = [&](const char *name) { + doindent(); + if (name) { + fout << "\"" << name << "\": {\n"; + } else { + fout << "{\n"; + } + indent++; + }; + + auto end_obj = [&](bool end) { + indent--; + doindent(); + fout << (end ? "}\n" : "},\n"); + }; + + auto start_value = [&](const char *name) { + doindent(); + fout << "\"" << name << "\": "; + }; + + auto value_s = [&](const char *name, const char *val, bool end) { + start_value(name); + char * val_escaped = escape_double_quotes_and_backslashes(val); + fout << "\"" << val_escaped << (end ? 
"\"\n" : "\",\n"); + free(val_escaped); + }; + + auto end_value = [&](bool end) { + fout << (end ? "\n" : ",\n"); + }; + + auto value_i = [&](const char *name, const int64_t val, bool end) { + start_value(name); + fout << val; + end_value(end); + }; + + auto value_b = [&](const char *name, const bool val, bool end) { + start_value(name); + fout << (val ? "true" : "false"); + end_value(end); + }; + + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + start_obj(nullptr); + value_s("systeminfo", whisper_print_system_info(), false); + start_obj("model"); + value_s("type", whisper_model_type_readable(ctx), false); + value_b("multilingual", whisper_is_multilingual(ctx), false); + value_i("vocab", whisper_model_n_vocab(ctx), false); + start_obj("audio"); + value_i("ctx", whisper_model_n_audio_ctx(ctx), false); + value_i("state", whisper_model_n_audio_state(ctx), false); + value_i("head", whisper_model_n_audio_head(ctx), false); + value_i("layer", whisper_model_n_audio_layer(ctx), true); + end_obj(false); + start_obj("text"); + value_i("ctx", whisper_model_n_text_ctx(ctx), false); + value_i("state", whisper_model_n_text_state(ctx), false); + value_i("head", whisper_model_n_text_head(ctx), false); + value_i("layer", whisper_model_n_text_layer(ctx), true); + end_obj(false); + value_i("mels", whisper_model_n_mels(ctx), false); + value_i("ftype", whisper_model_ftype(ctx), true); + end_obj(false); + start_obj("params"); + value_s("model", params.model.c_str(), false); + value_s("language", params.language.c_str(), false); + value_b("translate", params.translate, true); + end_obj(false); + start_obj("result"); + value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true); + end_obj(false); + start_arr("transcription"); + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + + start_obj(nullptr); + start_obj("timestamps"); + value_s("from", to_timestamp(t0, true).c_str(), false); + value_s("to", to_timestamp(t1, true).c_str(), true); + end_obj(false); + start_obj("offsets"); + value_i("from", t0 * 10, false); + value_i("to", t1 * 10, true); + end_obj(false); + value_s("text", text, !params.diarize && !params.tinydiarize); + + if (params.diarize && pcmf32s.size() == 2) { + value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true); + } + + if (params.tinydiarize) { + value_b("speaker_turn_next", whisper_full_get_segment_speaker_turn_next(ctx, i), true); + } + end_obj(i == (n_segments - 1)); + } + + end_arr(true); + end_obj(true); + return true; +} + +// karaoke video generation +// outputs a bash script that uses ffmpeg to generate a video with the subtitles +// TODO: font parameter adjustments +bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector> pcmf32s) { + std::ofstream fout(fname); + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + static const char * font = params.font_path.c_str(); + + std::ifstream fin(font); + if (!fin.is_open()) { + fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font); + return false; + } + + fout << "#!/bin/bash" << 
"\n"; + fout << "\n"; + + fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \""; + + for (int i = 0; i < whisper_full_n_segments(ctx); i++) { + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + + const int n = whisper_full_n_tokens(ctx, i); + + std::vector tokens(n); + for (int j = 0; j < n; ++j) { + tokens[j] = whisper_full_get_token_data(ctx, i, j); + } + + if (i > 0) { + fout << ","; + } + + // background text + fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'"; + + bool is_first = true; + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) { + speaker = estimate_diarization_speaker(pcmf32s, t0, t1); + } + + for (int j = 0; j < n; ++j) { + const auto & token = tokens[j]; + + if (tokens[j].id >= whisper_token_eot(ctx)) { + continue; + } + + std::string txt_bg = ""; + std::string txt_fg = ""; // highlight token + std::string txt_ul = ""; // underline + + if (params.diarize && pcmf32s.size() == 2) { + txt_bg = speaker; + txt_fg = speaker; + txt_ul = "\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ "; + } + + txt_bg.append("> "); + txt_fg.append("> "); + txt_ul.append("\\ \\ "); + + { + for (int k = 0; k < n; ++k) { + const auto & token2 = tokens[k]; + + if (tokens[k].id >= whisper_token_eot(ctx)) { + continue; + } + + const std::string txt = whisper_token_to_str(ctx, token2.id); + + txt_bg += txt; + + if (k == j) { + for (int l = 0; l < (int) txt.size(); ++l) { + txt_fg += txt[l]; + txt_ul += "_"; + } + txt_fg += "|"; + } else { + for (int l = 0; l < (int) txt.size(); ++l) { + txt_fg += "\\ "; + txt_ul += "\\ "; + } + } + } + + ::replace_all(txt_bg, "'", "\u2019"); + ::replace_all(txt_bg, "\"", "\\\""); + ::replace_all(txt_fg, "'", "\u2019"); + ::replace_all(txt_fg, "\"", "\\\""); + } + + if (is_first) { + // background text + fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'"; + is_first = false; + } + + // foreground text + fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'"; + + // underline + fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'"; + } + } + + fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n"; + + fout << "\n\n"; + fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n"; + fout << "\n"; + fout << "echo \" ffplay " << fname_inp << ".mp4\"\n"; + fout << "\n"; + + fout.close(); + + fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname); + + return true; +} + +bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector> pcmf32s) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + fout << "[by:whisper.cpp]\n"; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text 
= whisper_full_get_segment_text(ctx, i); + const int64_t t = whisper_full_get_segment_t0(ctx, i); + + int64_t msec = t * 10; + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; + + char buf[16]; + snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) ( msec / 10)); + std::string timestamp_lrc = std::string(buf); + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) + { + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + speaker = estimate_diarization_speaker(pcmf32s, t0, t1); + } + + fout << '[' << timestamp_lrc << ']' << speaker << text << "\n"; + } + + return true; +} + +int main(int argc, char ** argv) { + whisper_params params; + + if (whisper_params_parse(argc, argv, params) == false) { + whisper_print_usage(argc, argv, params); + return 1; + } + + if (params.fname_inp.empty()) { + fprintf(stderr, "error: no input files specified\n"); + whisper_print_usage(argc, argv, params); + return 2; + } + + if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) { + fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str()); + whisper_print_usage(argc, argv, params); + exit(0); + } + + if (params.diarize && params.tinydiarize) { + fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n"); + whisper_print_usage(argc, argv, params); + exit(0); + } + + // whisper init + + struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + + if (ctx == nullptr) { + fprintf(stderr, "error: failed to initialize whisper context\n"); + return 3; + } + + // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured + whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); + + for (int f = 0; f < (int) params.fname_inp.size(); ++f) { + const auto fname_inp = params.fname_inp[f]; + const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; + + std::vector pcmf32; // mono-channel F32 PCM + std::vector> pcmf32s; // stereo-channel F32 PCM + + if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) { + fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str()); + continue; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info()); + } + + // print some info about the processing + { + fprintf(stderr, "\n"); + if (!whisper_is_multilingual(ctx)) { + if (params.language != "en" || params.translate) { + params.language = "en"; + params.translate = false; + fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); + } + } + if (params.detect_language) { + params.language = "auto"; + } + fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n", + __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, + params.n_threads, params.n_processors, + params.language.c_str(), + params.translate ? "translate" : "transcribe", + params.tinydiarize ? "tdrz = 1, " : "", + params.no_timestamps ? 
0 : 1);
+
+        fprintf(stderr, "\n");
+    }
+
+    // run the inference
+    {
+        whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+        wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+        wparams.print_realtime   = false;
+        wparams.print_progress   = params.print_progress;
+        wparams.print_timestamps = !params.no_timestamps;
+        wparams.print_special    = params.print_special;
+        wparams.translate        = params.translate;
+        wparams.language         = params.language.c_str();
+        wparams.detect_language  = params.detect_language;
+        wparams.n_threads        = params.n_threads;
+        wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
+        wparams.offset_ms        = params.offset_t_ms;
+        wparams.duration_ms      = params.duration_ms;
+
+        wparams.token_timestamps = params.output_wts || params.max_len > 0;
+        wparams.thold_pt         = params.word_thold;
+        wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+        wparams.split_on_word    = params.split_on_word;
+
+        wparams.speed_up         = params.speed_up;
+        wparams.debug_mode       = params.debug_mode;
+
+        wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
+
+        wparams.initial_prompt   = params.prompt.c_str();
+
+        wparams.greedy.best_of        = params.best_of;
+        wparams.beam_search.beam_size = params.beam_size;
+
+        wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
+        wparams.entropy_thold    = params.entropy_thold;
+        wparams.logprob_thold    = params.logprob_thold;
+
+        whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
+
+        // this callback is called on each new segment
+        if (!wparams.print_realtime) {
+            wparams.new_segment_callback           = whisper_print_segment_callback;
+            wparams.new_segment_callback_user_data = &user_data;
+        }
+
+        if (wparams.print_progress) {
+            wparams.progress_callback           = whisper_print_progress_callback;
+            wparams.progress_callback_user_data = &user_data;
+        }
+
+        // example for abort mechanism
+        // in this example, we do not abort the processing, but we could if the flag is set to true
+        // the callback is called before every encoder run - if it returns false, the processing is aborted
+        {
+            static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+
+            wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                bool is_aborted = *(bool*)user_data;
+                return !is_aborted;
+            };
+            wparams.encoder_begin_callback_user_data = &is_aborted;
+        }
+
+        if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+            fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+            return 10;
+        }
+    }
+
+    // output stuff
+    {
+        printf("\n");
+
+        // output to text file
+        if (params.output_txt) {
+            const auto fname_txt = fname_out + ".txt";
+            output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
+        }
+
+        // output to VTT file
+        if (params.output_vtt) {
+            const auto fname_vtt = fname_out + ".vtt";
+            output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
+        }
+
+        // output to SRT file
+        if (params.output_srt) {
+            const auto fname_srt = fname_out + ".srt";
+            output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
+        }
+
+        // output to WTS file
+        if (params.output_wts) {
+            const auto fname_wts = fname_out + ".wts";
+            output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
+        }
+
+        // output to CSV file
+        if (params.output_csv) {
+            const auto fname_csv = fname_out + ".csv";
+            output_csv(ctx,
fname_csv.c_str(), params, pcmf32s);
+        }
+
+        // output to JSON file
+        if (params.output_jsn) {
+            const auto fname_jsn = fname_out + ".json";
+            output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
+        }
+
+        // output to LRC file
+        if (params.output_lrc) {
+            const auto fname_lrc = fname_out + ".lrc";
+            output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
+        }
+
+        // output to score file
+        if (params.log_score) {
+            const auto fname_score = fname_out + ".score.txt";
+            output_score(ctx, fname_score.c_str(), params, pcmf32s);
+        }
+    }
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/quantize.cpp b/stable-diffusion.cpp/ggml/examples/whisper/quantize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b01d61431086eaab0fcdd73418e95077fa4750b6
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/quantize.cpp
@@ -0,0 +1,223 @@
+#include "ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <regex>
+
+// default hparams (Whisper tiny)
+struct whisper_hparams {
+    int32_t n_vocab       = 51864;
+    int32_t n_audio_ctx   = 1500;
+    int32_t n_audio_state = 384;
+    int32_t n_audio_head  = 6;
+    int32_t n_audio_layer = 4;
+    int32_t n_text_ctx    = 448;
+    int32_t n_text_state  = 384;
+    int32_t n_text_head   = 6;
+    int32_t n_text_layer  = 4;
+    int32_t n_mels        = 80;
+    int32_t ftype         = 1;
+};
+
+struct whisper_filters {
+    int32_t n_mel;
+    int32_t n_fft;
+
+    std::vector<float> data;
+};
+
+// quantize a model
+bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+    gpt_vocab vocab;
+
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    auto finp = std::ifstream(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        uint32_t magic;
+        finp.read((char *) &magic, sizeof(magic));
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &magic, sizeof(magic));
+    }
+
+    whisper_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        finp.read((char *) &hparams.ftype,         sizeof(hparams.ftype));
+
+        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__,
hparams.n_audio_ctx); + fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); + fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head); + fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer); + fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx); + fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state); + fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); + fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); + fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); + fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype); + fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src); + fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst); + fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((const char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((const char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx)); + fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state)); + fout.write((const char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head)); + fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer)); + fout.write((const char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx)); + fout.write((const char *) &hparams.n_text_state, sizeof(hparams.n_text_state)); + fout.write((const char *) &hparams.n_text_head, sizeof(hparams.n_text_head)); + fout.write((const char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer)); + fout.write((const char *) &hparams.n_mels, sizeof(hparams.n_mels)); + fout.write((const char *) &ftype_dst, sizeof(hparams.ftype)); + } + + // load mel filters + { + whisper_filters filters; + + finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel)); + fout.write((char *) &filters.n_mel, sizeof(filters.n_mel)); + finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft)); + fout.write((char *) &filters.n_fft, sizeof(filters.n_fft)); + + filters.data.resize(filters.n_mel * filters.n_fft); + finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float)); + fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float)); + } + + // load vocab + { + int32_t n_vocab = 0; + finp.read ((char *) &n_vocab, sizeof(n_vocab)); + fout.write((char *) &n_vocab, sizeof(n_vocab)); + + //if (n_vocab != hparams.n_vocab) { + // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + // __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + // return false; + //} + + char word[129]; + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word[len] = '\0'; + + finp.read ((char *) word, len); + fout.write((char *) word, len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to not be quantized + const std::vector to_skip = { + //"encoder.*", + "encoder.conv1.bias", + "encoder.conv2.bias", + "encoder.positional_embedding", + "decoder.positional_embedding", + }; + + if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", 
argv[0]);
+        ggml_print_ftypes(stderr);
+        return 1;
+    }
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
+        printf("%s: total time    = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/whisper.cpp b/stable-diffusion.cpp/ggml/examples/whisper/whisper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1224be9bf51bcb4ae10dc5adfef9015aca807da0
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/whisper.cpp
@@ -0,0 +1,5818 @@
+#include "whisper.h"
+#ifdef WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif
+
+#ifdef GGML_USE_METAL
+# include "ggml-metal.h"
+#endif
+
+#ifdef WHISPER_USE_OPENVINO
+#include "openvino/whisper-openvino-encoder.h"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#include <algorithm>
+#include <cassert>
+#define _USE_MATH_DEFINES
+#include <cmath>
+#include <cstdio>
+#include <cstdarg>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <thread>
+#include <vector>
+#include <regex>
+#include <random>
+#include <functional>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if defined(GGML_BIG_ENDIAN)
+#include <bit>
+
+template<typename T>
+static T byteswap(T value) {
+    return std::byteswap(value);
+}
+
+template<>
+float byteswap(float value) {
+    return std::bit_cast<float>(byteswap(std::bit_cast<std::uint32_t>(value)));
+}
+
+template<typename T>
+static void byteswap_tensor_data(ggml_tensor * tensor) {
+    T * datum = reinterpret_cast<T *>(tensor->data);
+    for (int i = 0; i < ggml_nelements(tensor); i++) {
+        datum[i] = byteswap(datum[i]);
+    }
+}
+
+static void byteswap_tensor(ggml_tensor * tensor) {
+    switch (tensor->type) {
+        case GGML_TYPE_I16: {
+            byteswap_tensor_data<int16_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_F16: {
+            byteswap_tensor_data<ggml_fp16_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_I32: {
+            byteswap_tensor_data<int32_t>(tensor);
+            break;
+        }
+        case GGML_TYPE_F32: {
+            byteswap_tensor_data<float>(tensor);
+            break;
+        }
+        default: { // GGML_TYPE_I8
+            break;
+        }
+    }
+}
+
+#define BYTESWAP_VALUE(d) d = byteswap(d)
+#define BYTESWAP_FILTERS(f)           \
+    do {                              \
+        for (auto & datum : f.data) { \
+            datum = byteswap(datum);  \
+        }                             \
+    } while (0)
+#define BYTESWAP_TENSOR(t)  \
+    do {                    \
+        byteswap_tensor(t); \
+    } while (0)
+#else
+#define BYTESWAP_VALUE(d) do {} while (0)
+#define BYTESWAP_FILTERS(f) do {} while (0)
+#define BYTESWAP_TENSOR(t) do {} while (0)
+#endif
+
+#define WHISPER_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+// define this to enable verbose trace logging - useful for debugging purposes
+//#define WHISPER_DEBUG
+
+#if defined(WHISPER_DEBUG)
+#define WHISPER_PRINT_DEBUG(...)
\ + do { \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) +#else +#define WHISPER_PRINT_DEBUG(...) +#endif + +//#define WHISPER_USE_FLASH_ATTN +//#define WHISPER_USE_FLASH_FF +#define WHISPER_MAX_DECODERS 16 + +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad" +// the idea is to represent the original matrix multiplication: +// +// Z = X @ Y +// +// with the sum of two matrix multiplications: +// +// Z = (X_0 @ Y_0) + (X_1 @ Y_1) +// +// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad" +// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more +// general-purpose kernels +// +static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) { + // use padding only if dimension 0 is at least 8 times larger than the padding + // else we won't get much benefit from the optimization + const int n_pad_req = 8; + + if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) { + return ggml_mul_mat(ctx, x, y); + } + + struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0); + struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]); + + struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0); + struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]); + + return ggml_add(ctx, + ggml_mul_mat(ctx, x_0, y_0), + ggml_mul_mat(ctx, x_1, y_1)); +} + +// TODO: check if other platforms can benefit from this optimization +#if defined(GGML_USE_METAL) +#define ggml_mul_mat ggml_mul_mat_pad +#endif + +// available whisper models +enum e_model { + MODEL_UNKNOWN, + MODEL_TINY, + MODEL_BASE, + MODEL_SMALL, + MODEL_MEDIUM, + MODEL_LARGE, +}; + +static const std::map> g_lang = { + { "en", { 0, "english", } }, + { "zh", { 1, "chinese", } }, + { "de", { 2, "german", } }, + { "es", { 3, "spanish", } }, + { "ru", { 4, "russian", } }, + { "ko", { 5, "korean", } }, + { "fr", { 6, "french", } }, + { "ja", { 7, "japanese", } }, + { "pt", { 8, "portuguese", } }, + { "tr", { 9, "turkish", } }, + { "pl", { 10, "polish", } }, + { "ca", { 11, "catalan", } }, + { "nl", { 12, "dutch", } }, + { "ar", { 13, "arabic", } }, + { "sv", { 14, "swedish", } }, + { "it", { 15, "italian", } }, + { "id", { 16, "indonesian", } }, + { "hi", { 17, "hindi", } }, + { "fi", { 18, "finnish", } }, + { "vi", { 19, "vietnamese", } }, + { "he", { 20, "hebrew", } }, + { "uk", { 21, "ukrainian", } }, + { "el", { 22, "greek", } }, + { "ms", { 23, "malay", } }, + { "cs", { 24, "czech", } }, + { "ro", { 25, "romanian", } }, + { "da", { 26, "danish", } }, + { "hu", { 27, "hungarian", } }, + { "ta", { 28, "tamil", } }, + { "no", { 29, "norwegian", } }, + { "th", { 30, "thai", } }, + { "ur", { 31, "urdu", } }, + { "hr", { 32, "croatian", } }, + { "bg", { 33, "bulgarian", } }, + { "lt", { 34, "lithuanian", } }, + { "la", { 35, "latin", } }, + { "mi", { 36, "maori", } }, + { "ml", { 37, "malayalam", } }, + { 
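+    // (editor's note: the integer ids follow the language ordering of the
+    // openai/whisper tokenizer; whisper derives each language token from the
+    // SOT token plus this id, so the order must not change)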
"cy", { 38, "welsh", } }, + { "sk", { 39, "slovak", } }, + { "te", { 40, "telugu", } }, + { "fa", { 41, "persian", } }, + { "lv", { 42, "latvian", } }, + { "bn", { 43, "bengali", } }, + { "sr", { 44, "serbian", } }, + { "az", { 45, "azerbaijani", } }, + { "sl", { 46, "slovenian", } }, + { "kn", { 47, "kannada", } }, + { "et", { 48, "estonian", } }, + { "mk", { 49, "macedonian", } }, + { "br", { 50, "breton", } }, + { "eu", { 51, "basque", } }, + { "is", { 52, "icelandic", } }, + { "hy", { 53, "armenian", } }, + { "ne", { 54, "nepali", } }, + { "mn", { 55, "mongolian", } }, + { "bs", { 56, "bosnian", } }, + { "kk", { 57, "kazakh", } }, + { "sq", { 58, "albanian", } }, + { "sw", { 59, "swahili", } }, + { "gl", { 60, "galician", } }, + { "mr", { 61, "marathi", } }, + { "pa", { 62, "punjabi", } }, + { "si", { 63, "sinhala", } }, + { "km", { 64, "khmer", } }, + { "sn", { 65, "shona", } }, + { "yo", { 66, "yoruba", } }, + { "so", { 67, "somali", } }, + { "af", { 68, "afrikaans", } }, + { "oc", { 69, "occitan", } }, + { "ka", { 70, "georgian", } }, + { "be", { 71, "belarusian", } }, + { "tg", { 72, "tajik", } }, + { "sd", { 73, "sindhi", } }, + { "gu", { 74, "gujarati", } }, + { "am", { 75, "amharic", } }, + { "yi", { 76, "yiddish", } }, + { "lo", { 77, "lao", } }, + { "uz", { 78, "uzbek", } }, + { "fo", { 79, "faroese", } }, + { "ht", { 80, "haitian creole", } }, + { "ps", { 81, "pashto", } }, + { "tk", { 82, "turkmen", } }, + { "nn", { 83, "nynorsk", } }, + { "mt", { 84, "maltese", } }, + { "sa", { 85, "sanskrit", } }, + { "lb", { 86, "luxembourgish", } }, + { "my", { 87, "myanmar", } }, + { "bo", { 88, "tibetan", } }, + { "tl", { 89, "tagalog", } }, + { "mg", { 90, "malagasy", } }, + { "as", { 91, "assamese", } }, + { "tt", { 92, "tatar", } }, + { "haw", { 93, "hawaiian", } }, + { "ln", { 94, "lingala", } }, + { "ha", { 95, "hausa", } }, + { "ba", { 96, "bashkir", } }, + { "jw", { 97, "javanese", } }, + { "su", { 98, "sundanese", } }, +}; + +static const size_t MB = 1ull*1024*1024; + +// TODO: avoid using GGUF +static const std::map> MEM_REQ_MODEL = { + { GGML_TYPE_F32, + { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, + }, + }, + { GGML_TYPE_F16, + { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, + }, + }, + { GGML_TYPE_Q4_0, + { + { MODEL_TINY, 26ull*MB }, + { MODEL_BASE, 50ull*MB }, + { MODEL_SMALL, 154ull*MB }, + { MODEL_MEDIUM, 470ull*MB }, + { MODEL_LARGE, 940ull*MB }, + }, + }, + { GGML_TYPE_Q4_1, + { + { MODEL_TINY, 32ull*MB }, + { MODEL_BASE, 58ull*MB }, + { MODEL_SMALL, 182ull*MB }, + { MODEL_MEDIUM, 562ull*MB }, + { MODEL_LARGE, 1124ull*MB }, + }, + }, + { GGML_TYPE_Q5_0, + { + { MODEL_TINY, 30ull*MB }, + { MODEL_BASE, 54ull*MB }, + { MODEL_SMALL, 170ull*MB }, + { MODEL_MEDIUM, 516ull*MB }, + { MODEL_LARGE, 1034ull*MB }, + }, + }, + { GGML_TYPE_Q5_1, + { + { MODEL_TINY, 32ull*MB }, + { MODEL_BASE, 58ull*MB }, + { MODEL_SMALL, 182ull*MB }, + { MODEL_MEDIUM, 562ull*MB }, + { MODEL_LARGE, 1124ull*MB }, + }, + }, + { GGML_TYPE_Q8_0, + { + { MODEL_TINY, 45ull*MB }, + { MODEL_BASE, 84ull*MB }, + { MODEL_SMALL, 268ull*MB }, + { MODEL_MEDIUM, 834ull*MB }, + { MODEL_LARGE, 1674ull*MB }, + }, + }, +}; + +struct whisper_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector data; +}; + +struct whisper_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector data; +}; + +struct 
whisper_vocab { + using id = int32_t; + using token = std::string; + + int n_vocab = 51864; + + std::map token_to_id; + std::map id_to_token; + + // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349 + id token_eot = 50256; + id token_sot = 50257; + // task tokens (used only for multilingual models) + id token_translate = 50357; + id token_transcribe = 50358; + // other special tokens + id token_solm = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn + id token_prev = 50360; + id token_nosp = 50361; + id token_not = 50362; // no timestamps + id token_beg = 50363; // begin timestamps + + bool is_multilingual() const { + return n_vocab == 51865; + } +}; + +struct whisper_segment { + int64_t t0; + int64_t t1; + + std::string text; + + std::vector tokens; + + bool speaker_turn_next; +}; + +// medium +// hparams: { +// 'n_mels': 80, +// 'n_vocab': 51864, +// 'n_audio_ctx': 1500, +// 'n_audio_state': 1024, +// 'n_audio_head': 16, +// 'n_audio_layer': 24, +// 'n_text_ctx': 448, +// 'n_text_state': 1024, +// 'n_text_head': 16, +// 'n_text_layer': 24 +// } +// +// default hparams (Whisper tiny) +struct whisper_hparams { + int32_t n_vocab = 51864; + int32_t n_audio_ctx = 1500; + int32_t n_audio_state = 384; + int32_t n_audio_head = 6; + int32_t n_audio_layer = 4; + int32_t n_text_ctx = 448; + int32_t n_text_state = 384; + int32_t n_text_head = 6; + int32_t n_text_layer = 4; + int32_t n_mels = 80; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +// audio encoding layer +struct whisper_layer_encoder { + // encoder.blocks.*.attn_ln + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; + + // encoder.blocks.*.attn.out + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; + + // encoder.blocks.*.attn.query + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; + + // encoder.blocks.*.attn.key + struct ggml_tensor * attn_k_w; + + // encoder.blocks.*.attn.value + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; + + // encoder.blocks.*.mlp_ln + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * mlp_ln_b; + + // encoder.blocks.*.mlp.0 + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; + + // encoder.blocks.*.mlp.2 + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; +}; + +// token decoding layer +struct whisper_layer_decoder { + // decoder.blocks.*.attn_ln + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; + + // decoder.blocks.*.attn.out + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; + + // decoder.blocks.*.attn.query + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; + + // decoder.blocks.*.attn.key + struct ggml_tensor * attn_k_w; + + // decoder.blocks.*.attn.value + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; + + // decoder.blocks.*.cross_attn_ln + struct ggml_tensor * cross_attn_ln_0_w; + struct ggml_tensor * cross_attn_ln_0_b; + + // decoder.blocks.*.cross_attn.out + struct ggml_tensor * cross_attn_ln_1_w; + struct ggml_tensor * cross_attn_ln_1_b; + + // decoder.blocks.*.cross_attn.query + struct ggml_tensor * cross_attn_q_w; + struct ggml_tensor * cross_attn_q_b; + + // decoder.blocks.*.cross_attn.key + struct ggml_tensor * cross_attn_k_w; + + // decoder.blocks.*.cross_attn.value + struct ggml_tensor * cross_attn_v_w; + struct ggml_tensor * cross_attn_v_b; + + // decoder.blocks.*.mlp_ln + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * 
mlp_ln_b; + + // decoder.blocks.*.mlp.0 + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; + + // decoder.blocks.*.mlp.2 + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; +}; + +struct whisper_kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx; + + // buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init) + std::vector buf; + + int n; // number of tokens currently in the cache +}; + +struct whisper_model { + e_model type = MODEL_UNKNOWN; + + whisper_hparams hparams; + whisper_filters filters; + + // encoder.positional_embedding + struct ggml_tensor * e_pe; + + // encoder.conv1 + struct ggml_tensor * e_conv_1_w; + struct ggml_tensor * e_conv_1_b; + + // encoder.conv2 + struct ggml_tensor * e_conv_2_w; + struct ggml_tensor * e_conv_2_b; + + // encoder.ln_post + struct ggml_tensor * e_ln_w; + struct ggml_tensor * e_ln_b; + + // decoder.positional_embedding + struct ggml_tensor * d_pe; + + // decoder.token_embedding + struct ggml_tensor * d_te; + + // decoder.ln + struct ggml_tensor * d_ln_w; + struct ggml_tensor * d_ln_b; + + std::vector layers_encoder; + std::vector layers_decoder; + + // context + struct ggml_context * ctx; + + // the model memory buffer is read-only and can be shared between processors + std::vector * buf; + + // tensors + int n_loaded; + std::map tensors; +}; + +struct whisper_sequence { + std::vector tokens; + + // the accumulated transcription in the current iteration (used to truncate the tokens array) + int result_len; + + double sum_logprobs_all; // the sum of the log probabilities of the tokens + double sum_logprobs; // the sum of the log probabilities of the tokens (first result_len tokens) + double avg_logprobs; // the average log probability of the tokens + double entropy; // the entropy of the tokens + double score; // likelihood rank score +}; + +// TAGS: WHISPER_DECODER_INIT +struct whisper_decoder { + // each decoder keeps its own KV-cache + whisper_kv_cache kv_self; + + // the currently generated sequence of tokens + whisper_sequence sequence; + + int seek_delta; // the window shift found so far based on the decoded timestamp tokens + + bool failed; // has the current segment failed to decode? + bool completed; // has the decoder completed the current segment? + bool has_ts; // have we already sampled a non-beg timestamp token for the current segment? + + // new token probs, logits and logprobs after the last whisper_decode (1-dimensional array: [n_vocab]) + std::vector probs; + std::vector logits; + std::vector logprobs; + + std::vector tokens_tmp; // used for whisper_decode calls +}; + +// replace std::pair by using customized pair struct (reason: std::pair is very slow) +template +struct whisper_pair { + A first; + B second; + + // Define a constructor that takes two arguments. + whisper_pair(const A& a, const B& b) : first(a), second(b) {} + // Define a constructor that takes no argument. 
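+    // (editor's note: first(A()) / second(B()) value-initialize the members, so a
+    // default-constructed whisper_pair<double, whisper_token> starts out as {0.0, 0})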
+ whisper_pair() : first(A()), second(B()) {} +}; + +// beam-search helpers +struct kv_buf { + std::vector<uint8_t> k; + std::vector<uint8_t> v; +}; + +// ggml_allocr wrapper for whisper usage +struct whisper_allocr { + ggml_allocr * alloc = nullptr; + + std::vector<uint8_t> meta; + std::vector<uint8_t> data; +}; + +static size_t whisper_allocr_size(struct whisper_allocr & allocr) { + return allocr.meta.size() + allocr.data.size(); +} + +// measure the memory usage of a graph and prepare the allocr's internal data buffer +static void whisper_allocr_graph_init(struct whisper_allocr & allocr, std::function<struct ggml_cgraph *()> && get_graph) { + const int tensor_alignment = 32; + + auto & alloc = allocr.alloc; + auto & meta = allocr.meta; + auto & data = allocr.data; + + meta.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + + alloc = ggml_allocr_new_measure(tensor_alignment); + + const size_t alloc_size = ggml_allocr_alloc_graph(alloc, get_graph()) + tensor_alignment; + + ggml_allocr_free(alloc); + + data.resize(alloc_size); + + alloc = ggml_allocr_new(data.data(), data.size(), tensor_alignment); +} + +static void whisper_allocr_free(struct whisper_allocr & allocr) { + if (allocr.alloc) { + ggml_allocr_free(allocr.alloc); + allocr.alloc = nullptr; + } +} + +struct whisper_state { + int64_t t_sample_us = 0; + int64_t t_encode_us = 0; + int64_t t_decode_us = 0; + int64_t t_prompt_us = 0; + int64_t t_mel_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_encode = 0; // number of encoder calls + int32_t n_decode = 0; // number of decoder calls with n_tokens == 1 (text-generation) + int32_t n_prompt = 0; // number of decoder calls with n_tokens > 1 (prompt encoding) + int32_t n_fail_p = 0; // number of logprob threshold failures + int32_t n_fail_h = 0; // number of entropy threshold failures + + // cross-attention KV cache for the decoders + // shared between all decoders + whisper_kv_cache kv_cross; + whisper_mel mel; + + whisper_decoder decoders[WHISPER_MAX_DECODERS] = {}; + + // buffer for swapping KV caches between decoders during beam-search + std::vector<kv_buf> kv_swap_bufs; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector<uint8_t> work_buffer; + + // ggml-alloc: + // - stores meta info about the intermediate tensors into the `meta` buffers + // - stores the actual tensor data into the `data` buffers + whisper_allocr alloc_conv; + whisper_allocr alloc_encode; + whisper_allocr alloc_cross; + whisper_allocr alloc_decode; + + // result of the encoder + struct ggml_tensor * embd_conv = nullptr; + struct ggml_tensor * embd_enc = nullptr; + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector<float> logits; + + std::vector<whisper_segment> result_all; + std::vector<whisper_token> prompt_past; + + // work container used to avoid memory allocations + std::vector<whisper_pair<double, whisper_token>> logits_id; + + mutable std::mt19937 rng; // used for sampling at t > 0.0 + + int lang_id = 0; // english by default + + std::string path_model; // populated by whisper_init_from_file() +#ifdef WHISPER_USE_COREML + whisper_coreml_context * ctx_coreml = nullptr; +#endif + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = nullptr; +#endif + +#ifdef WHISPER_USE_OPENVINO + whisper_openvino_context * ctx_openvino = nullptr; +#endif + + // [EXPERIMENTAL] token-level timestamps data + int64_t t_beg = 0; + int64_t t_last = 0; + whisper_token tid_last; + std::vector<float> energy; // PCM signal energy + + // [EXPERIMENTAL] speed-up techniques + int32_t exp_n_audio_ctx = 0; // 0 - use default +}; + +struct whisper_context { + int64_t t_load_us = 0; + int64_t
t_start_us = 0; + + ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) + ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16) + + whisper_model model; + whisper_vocab vocab; + whisper_state * state = nullptr; + + std::string path_model; // populated by whisper_init_from_file() +}; + +static void whisper_default_log(const char * text) { + fprintf(stderr, "%s", text); +} + +static whisper_log_callback whisper_log = whisper_default_log; + +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((gnu_format(printf, 1, 2))) +#else +__attribute__((format(printf, 1, 2))) +#endif +#endif +static void log(const char * fmt, ...) { + if (!whisper_log) return; + char buf[1024]; + va_list args; + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + whisper_log(buf); +} + +template<typename T> +static void read_safe(whisper_model_loader * loader, T & dest) { + loader->read(loader->context, &dest, sizeof(T)); + BYTESWAP_VALUE(dest); +} + +static bool kv_cache_init( + const struct whisper_hparams & hparams, + struct whisper_kv_cache & cache, + ggml_type wtype, + int n_ctx) { + const int64_t n_text_state = hparams.n_text_state; + const int64_t n_text_layer = hparams.n_text_layer; + + const int64_t n_mem = n_text_layer*n_ctx; + const int64_t n_elements = n_text_state*n_mem; + + const size_t mem_bytes = 2*(ggml_type_size(wtype)*n_elements + ggml_tensor_overhead()); + + cache.buf.resize(mem_bytes); + + struct ggml_init_params params = { + /*.mem_size =*/ cache.buf.size(), + /*.mem_buffer =*/ cache.buf.data(), + /*.no_alloc =*/ false, + }; + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + log("%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + + return true; +} + +static bool kv_cache_reinit(struct whisper_kv_cache & cache) { + WHISPER_ASSERT(cache.ctx); + + const int n_elements = ggml_nelements(cache.k); + WHISPER_ASSERT(n_elements == ggml_nelements(cache.v)); + + const ggml_type wtype = cache.k->type; + WHISPER_ASSERT(wtype == cache.v->type); + + WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype)); + + struct ggml_init_params params = { + /*.mem_size =*/ cache.buf.size(), + /*.mem_buffer =*/ cache.buf.data(), + /*.no_alloc =*/ false, + }; + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + log("%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + + return true; +} + +static void kv_cache_free(struct whisper_kv_cache & cache) { + if (cache.ctx) { + ggml_free(cache.ctx); + cache.ctx = nullptr; + } +} + +// load the model from a ggml file +// +// file format: +// +// - hparams +// - pre-computed mel filters +// - vocab +// - weights +// +// see the convert-pt-to-ggml.py script for details +// +static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) { + log("%s: loading model\n", __func__); + + const int64_t t_start_us = ggml_time_us(); + + wctx.t_start_us = t_start_us; + + auto & model = wctx.model; + auto & vocab = wctx.vocab; + + // verify magic + { + uint32_t magic; + read_safe(loader, magic); + if (magic != GGML_FILE_MAGIC) { + log("%s: invalid model data (bad magic)\n", __func__); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + read_safe(loader,
hparams.n_vocab); + read_safe(loader, hparams.n_audio_ctx); + read_safe(loader, hparams.n_audio_state); + read_safe(loader, hparams.n_audio_head); + read_safe(loader, hparams.n_audio_layer); + read_safe(loader, hparams.n_text_ctx); + read_safe(loader, hparams.n_text_state); + read_safe(loader, hparams.n_text_head); + read_safe(loader, hparams.n_text_layer); + read_safe(loader, hparams.n_mels); + read_safe(loader, hparams.ftype); + + assert(hparams.n_text_state == hparams.n_audio_state); + + if (hparams.n_audio_layer == 4) { + model.type = e_model::MODEL_TINY; + } + + if (hparams.n_audio_layer == 6) { + model.type = e_model::MODEL_BASE; + } + + if (hparams.n_audio_layer == 12) { + model.type = e_model::MODEL_SMALL; + } + + if (hparams.n_audio_layer == 24) { + model.type = e_model::MODEL_MEDIUM; + } + + if (hparams.n_audio_layer == 32) { + model.type = e_model::MODEL_LARGE; + } + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wctx.wtype == GGML_TYPE_COUNT) { + log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype); + return false; + } + + const size_t scale = model.hparams.ftype ? 1 : 2; + + log("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + log("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); + log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); + log("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head); + log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer); + log("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx); + log("%s: n_text_state = %d\n", __func__, hparams.n_text_state); + log("%s: n_text_head = %d\n", __func__, hparams.n_text_head); + log("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); + log("%s: n_mels = %d\n", __func__, hparams.n_mels); + log("%s: ftype = %d\n", __func__, model.hparams.ftype); + log("%s: qntvr = %d\n", __func__, qntvr); + log("%s: type = %d\n", __func__, model.type); + + // print memory requirements + { + // TODO + //log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__, + // mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0); + } + + // initialize all memory buffers + // always have at least one decoder + + wctx.model.buf = new std::vector<uint8_t>(); + wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type)); + + // we skip initialization of the state until it is needed, + // because the state might always be provided externally.
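As an aside, the stored hparams.ftype packs two values which the two lines above unpack: the quantization format version in the high decimal digits and the base tensor type in the low ones. A minimal standalone sketch of the split, assuming GGML_QNT_VERSION_FACTOR is 1000 as in ggml.h at the time of writing (an assumption; the value 2001 and the variable names below are purely illustrative):

    #include <cstdio>

    int main() {
        const int qnt_version_factor = 1000;           // assumed GGML_QNT_VERSION_FACTOR
        int ftype = 2001;                              // hypothetical stored hparams.ftype
        const int qntvr = ftype / qnt_version_factor;  // quantization version -> 2
        ftype %= qnt_version_factor;                   // base ftype -> 1 (mostly F16)
        std::printf("qntvr = %d, ftype = %d\n", qntvr, ftype);
        return 0;
    }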
+ } + + // load mel filters + { + auto & filters = wctx.model.filters; + + read_safe(loader, filters.n_mel); + read_safe(loader, filters.n_fft); + + filters.data.resize(filters.n_mel * filters.n_fft); + loader->read(loader->context, filters.data.data(), filters.data.size() * sizeof(float)); + BYTESWAP_FILTERS(filters); + } + + // load vocab + { + int32_t n_vocab = 0; + read_safe(loader, n_vocab); + + //if (n_vocab != model.hparams.n_vocab) { + // log("%s: invalid model file '%s' (bad vocab size %d != %d)\n", + // __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + // return false; + //} + + std::string word; + std::vector<char> tmp; + + tmp.reserve(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + read_safe(loader, len); + + if (len > 0) { + tmp.resize(len); + loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer + word.assign(&tmp[0], tmp.size()); + } else { + // seems like we have an empty-string token in multi-language models (i = 50256) + //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i); + word = ""; + } + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + vocab.n_vocab = model.hparams.n_vocab; + if (vocab.is_multilingual()) { + vocab.token_eot++; + vocab.token_sot++; + vocab.token_translate++; + vocab.token_transcribe++; + vocab.token_solm++; + vocab.token_prev++; + vocab.token_nosp++; + vocab.token_not++; + vocab.token_beg++; + } + + if (n_vocab < model.hparams.n_vocab) { + log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab); + for (int i = n_vocab; i < model.hparams.n_vocab; i++) { + if (i > vocab.token_beg) { + word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]"; + } else if (i == vocab.token_eot) { + word = "[_EOT_]"; + } else if (i == vocab.token_sot) { + word = "[_SOT_]"; + } else if (i == vocab.token_solm) { + word = "[_SOLM_]"; + } else if (i == vocab.token_prev) { + word = "[_PREV_]"; + } else if (i == vocab.token_nosp) { + word = "[_NOSP_]"; + } else if (i == vocab.token_not) { + word = "[_NOT_]"; + } else if (i == vocab.token_beg) { + word = "[_BEG_]"; + } else { + word = "[_extra_token_" + std::to_string(i) + "]"; + } + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + } + + size_t ctx_size = 0; + + const ggml_type wtype = wctx.wtype; + const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ?
GGML_TYPE_F32 : GGML_TYPE_F16; // conv type + + { + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + const int n_audio_ctx = hparams.n_audio_ctx; + const int n_audio_state = hparams.n_audio_state; + const int n_audio_layer = hparams.n_audio_layer; + + const int n_text_ctx = hparams.n_text_ctx; + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + + const int n_mels = hparams.n_mels; + + // encoder + { + ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe; + + ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b + + ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b + + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b; + } + + // decoder + { + ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe; + + ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te; + + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b; + } + + // encoder layers + { + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b + + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b + + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b + + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b + } + + // decoder layers + { + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b + + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b + + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b + + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b + + ctx_size += 
n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b + // + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b + } + + ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead + + log("%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ wctx.model.buf->size(), + /*.mem_buffer =*/ wctx.model.buf->data(), + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + log("%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + auto & ctx = model.ctx; + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + const int n_audio_ctx = hparams.n_audio_ctx; + const int n_audio_state = hparams.n_audio_state; + const int n_audio_layer = hparams.n_audio_layer; + + const int n_text_ctx = hparams.n_text_ctx; + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + + const int n_mels = hparams.n_mels; + + model.layers_encoder.resize(n_audio_layer); + model.layers_decoder.resize(n_text_layer); + + // encoder + { + model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); + + model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state); + model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); + + model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state); + model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); + + model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + // map by name + model.tensors["encoder.positional_embedding"] = model.e_pe; + + model.tensors["encoder.conv1.weight"] = model.e_conv_1_w; + model.tensors["encoder.conv1.bias"] = model.e_conv_1_b; + + model.tensors["encoder.conv2.weight"] = model.e_conv_2_w; + model.tensors["encoder.conv2.bias"] = model.e_conv_2_b; + + model.tensors["encoder.ln_post.weight"] = model.e_ln_w; + model.tensors["encoder.ln_post.bias"] = 
model.e_ln_b; + + for (int i = 0; i < n_audio_layer; ++i) { + auto & layer = model.layers_encoder[i]; + + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state); + + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + // map by name + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["encoder.blocks." 
+ std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + } + } + + // decoder + { + model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx); + + model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); + + model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + // map by name + model.tensors["decoder.positional_embedding"] = model.d_pe; + + model.tensors["decoder.token_embedding.weight"] = model.d_te; + + model.tensors["decoder.ln.weight"] = model.d_ln_w; + model.tensors["decoder.ln.bias"] = model.d_ln_b; + + for (int i = 0; i < n_text_layer; ++i) { + auto & layer = model.layers_decoder[i]; + + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state); + + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + + layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + // map by name + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; + model.tensors["decoder.blocks." 
+ std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b; + } + } + } + + // load weights + { + size_t total_size = 0; + + model.n_loaded = 0; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + read_safe(loader, n_dims); + read_safe(loader, length); + read_safe(loader, ttype); + + if (loader->eof(loader->context)) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(loader, ne[i]); + nelements *= ne[i]; + } + + std::string name; + std::vector<char> tmp(length); // create a buffer + loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer + name.assign(&tmp[0], tmp.size()); + + if (model.tensors.find(name) == model.tensors.end()) { + log("%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n", + __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { + log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]); + return false; + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + loader->read(loader->context, tensor->data, ggml_nbytes(tensor)); + BYTESWAP_TENSOR(tensor); + + //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor);
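For reference, each pass of the loop above consumes one tensor record. The on-disk layout implied by the reads (only the field order and widths come from the code; the field names here are illustrative) is: int32 n_dims, int32 name_length, int32 ttype, then int32 ne[n_dims] (up to 4 dims), char name[name_length], and finally the raw tensor payload of ggml_nbytes(tensor) bytes. The eof() check deliberately sits after the first three reads, so a clean end-of-file between records terminates the loop instead of being reported as an error.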
+ model.n_loaded++; + } + + log("%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0); + + if (model.n_loaded == 0) { + log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (model.n_loaded != (int) model.tensors.size()) { + log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); + return false; + } + } + + wctx.t_load_us = ggml_time_us() - t_start_us; + + return true; +} + +static bool whisper_encode_external(const whisper_state & wstate) { + GGML_UNUSED(wstate); + +#ifndef WHISPER_USE_COREML + const bool use_coreml = false; +#else + const bool use_coreml = wstate.ctx_coreml != nullptr; +#endif + +#ifndef WHISPER_USE_OPENVINO + const bool use_openvino = false; +#else + const bool use_openvino = wstate.ctx_openvino != nullptr; +#endif + + return use_coreml || use_openvino; +} + +static struct ggml_cgraph * whisper_build_graph_conv( + whisper_context & wctx, + whisper_state & wstate, + const int mel_offset) { + const auto & model = wctx.model; + const auto & mel_inp = wstate.mel; + const auto & hparams = model.hparams; + + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; GGML_UNUSED(n_state); + + const int n_mels = hparams.n_mels; + + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_conv.meta.size(), + /*.mem_buffer =*/ wstate.alloc_conv.meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_conv.alloc; + + struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels); + ggml_allocr_alloc(alloc, mel); + + assert(mel->type == GGML_TYPE_F32); + if (!ggml_allocr_is_measure(alloc)) { + assert(mel_inp.n_mel == n_mels); + + float * dst = (float *) mel->data; + memset(dst, 0, ggml_nbytes(mel)); + + const int i0 = std::min(mel_offset, mel_inp.n_len); + const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len); + + for (int j = 0; j < mel_inp.n_mel; ++j) { + for (int i = i0; i < i1; ++i) { + dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i]; + } + } + } + + struct ggml_tensor * cur = nullptr; + + if (!whisper_encode_external(wstate)) { + // convolution + gelu + { + cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_1_b, + cur), + cur); + + cur = ggml_gelu(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_2_b, + cur), + cur); + + cur = ggml_gelu(ctx0, cur); + } + + wstate.embd_conv = cur; + } else { +#ifdef WHISPER_USE_COREML + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + ggml_allocr_alloc(alloc, cur); + + if (!ggml_allocr_is_measure(alloc)) { + whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); + } +#endif +#ifdef WHISPER_USE_OPENVINO + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + ggml_allocr_alloc(alloc, cur); + + if (!ggml_allocr_is_measure(alloc)) { + whisper_openvino_encode(wstate.ctx_openvino, mel, cur); + } +#endif + + wstate.embd_enc = cur; + } + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * whisper_build_graph_encoder( + whisper_context & wctx, + whisper_state & wstate) { + const auto & model = wctx.model; + const 
auto & hparams = model.hparams; + + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; + const int n_head = hparams.n_audio_head; + const int n_layer = hparams.n_audio_layer; + + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_encode.meta.size(), + /*.mem_buffer =*/ wstate.alloc_encode.meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_encode.alloc; + + struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, KQscale); + + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head)); + } + + struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv); + + // =================================================================== + // NOTE: experimenting with partial evaluation of the encoder (ignore) + //static int iter = -1; + //const int n_iter = 1500/n_ctx; + + //iter = (iter + 1) % n_iter; + + //if (iter == 0) { + // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k)); + // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v)); + //} + + static int iter = 0; + + const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe); + const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter; + + struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); + + cur = ggml_add(ctx0, e_pe, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); + + // =================================================================== + + // original: + //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur)); + + struct ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_encoder[il]; + + // norm + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, layer.attn_ln_0_w), + layer.attn_ln_0_b); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, + layer.attn_q_w, + cur); + + Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b); + + //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, + layer.attn_k_w, + cur); + + //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, + layer.attn_v_w, + cur); + + Vcur = ggml_add(ctx0, Vcur, layer.attn_v_b); + + // ------ + +#ifdef WHISPER_USE_FLASH_ATTN + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)); + + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false); +#else + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 
3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head) + ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); +#endif + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx)); + } + + // projection + { + cur = ggml_mul_mat(ctx0, + layer.attn_ln_1_w, + cur); + + cur = ggml_add(ctx0, cur, layer.attn_ln_1_b); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, layer.mlp_ln_w), + layer.mlp_ln_b); + } + +#ifdef WHISPER_USE_FLASH_FF + cur = ggml_flash_ff(ctx0, + ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)), + layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); +#else + // fully connected + cur = ggml_mul_mat(ctx0, + layer.mlp_0_w, + cur); + + cur = ggml_add(ctx0, cur, layer.mlp_0_b); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + cur = ggml_mul_mat(ctx0, + layer.mlp_1_w, + cur); + + cur = ggml_add(ctx0, cur, layer.mlp_1_b); +#endif + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, hparams.eps); + + // cur = ln_f_g*cur + ln_f_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.e_ln_w), + model.e_ln_b); + } + + ggml_build_forward_expand(gf, cur); + + wstate.embd_enc = cur; + + //ggml_graph_print(gf); + + //////////////////////////////////////////////////////////////////////////// + + //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__, + // ggml_used_mem(ctx0)/1024.0/1024.0, + // wstate.get_buf_max_mem(0)/1024.0/1024.0, + // wstate.get_buf_max_mem(1)/1024.0/1024.0, + // wstate.get_buf_max_mem(2)/1024.0/1024.0, + // wstate.get_buf_max_mem(3)/1024.0/1024.0); + + ggml_free(ctx0); + + return gf; +} + +// pre-compute cross-attention memory +static struct ggml_cgraph * whisper_build_graph_cross( + whisper_context & wctx, + whisper_state & wstate) { + const auto & model = wctx.model; + const auto & hparams = model.hparams; + + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; + const int n_head = hparams.n_audio_head; + + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_cross.meta.size(), + /*.mem_buffer =*/ wstate.alloc_cross.meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_cross.alloc; + + struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc); + + struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, Kscale); + + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25)); + } + + for (int il = 0; il < model.hparams.n_text_layer; ++il) { + auto & layer = model.layers_decoder[il]; + + struct ggml_tensor* Kcross = ggml_mul_mat(ctx0, + layer.cross_attn_k_w, + cur); + + Kcross = ggml_scale(ctx0, Kcross, Kscale); + + struct ggml_tensor* Vcross = ggml_mul_mat(ctx0, + layer.cross_attn_v_w, + cur); + + Vcross = ggml_add(ctx0, + Vcross, + layer.cross_attn_v_b); + + Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, + n_state*n_ctx, + (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state, + ( n_ctx)*ggml_element_size(wstate.kv_cross.v), + (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcross, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v)); + } + + //ggml_graph_print(gf); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the encoder with the given state +// +// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder +// part of the transformer model and returns the encoded features +// +// - wctx: the model +// - wstate: the state of the encoder +// - n_threads: number of threads to use +// - mel_offset: offset in the mel spectrogram (i.e. 
audio offset) +// +static bool whisper_encode_internal( + whisper_context & wctx, + whisper_state & wstate, + const int mel_offset, + const int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + // conv + { + auto & alloc = wstate.alloc_conv.alloc; + + ggml_allocr_reset(alloc); + + ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset); + + ggml_allocr_alloc_graph(alloc, gf); + + if (!whisper_encode_external(wstate)) { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } + } + + // encoder + if (!whisper_encode_external(wstate)) { + auto & alloc = wstate.alloc_encode.alloc; + + ggml_allocr_reset(alloc); + + ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate); + + ggml_allocr_alloc_graph(alloc, gf); + +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif + } + + // cross + { + auto & alloc = wstate.alloc_cross.alloc; + + ggml_allocr_reset(alloc); + + ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate); + + ggml_allocr_alloc_graph(alloc, gf); + +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif + } + + // ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + wstate.t_encode_us += ggml_time_us() - t_start_us; + wstate.n_encode++; + + return true; +} + +static struct ggml_cgraph * whisper_build_graph_decoder( + whisper_context & wctx, + whisper_state & wstate, + whisper_decoder & decoder, + const whisper_token * tokens, + int n_tokens, + int n_past) { + const auto & model = wctx.model; + const auto & hparams = model.hparams; + + auto & kv_self = decoder.kv_self; + + WHISPER_ASSERT(!!kv_self.ctx); + + const int n_ctx = hparams.n_text_ctx; + const int n_state = hparams.n_text_state; + const int n_head = hparams.n_text_head; + const int n_layer = hparams.n_text_layer; + + const int N = n_tokens; + const int M = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + + //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx); + + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_decode.meta.size(), + /*.mem_buffer =*/ wstate.alloc_decode.meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_decode.alloc; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, embd); + + if (!ggml_allocr_is_measure(alloc)) { + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, position); + + if (!ggml_allocr_is_measure(alloc)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + } + + struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, KQscale); + + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25)); + } + + // token encoding + position encoding + struct ggml_tensor * cur = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.d_te, embd), + ggml_get_rows(ctx0, model.d_pe, position)); + + struct ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_decoder[il]; + + // norm + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.attn_ln_0_w), + layer.attn_ln_0_b); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, + layer.attn_q_w, + cur); + + Qcur = ggml_add(ctx0, + Qcur, + layer.attn_q_b); + + Qcur = ggml_scale(ctx0, Qcur, KQscale); + + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, + layer.attn_k_w, + cur); + + Kcur = ggml_scale(ctx0, Kcur, KQscale); + + // store key and value to memory + { + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, + layer.attn_v_w, + cur); + + Vcur = ggml_add(ctx0, + Vcur, + layer.attn_v_b); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // ------ + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_state/n_head, n_past + N, n_head, + ggml_element_size(kv_self.k)*n_state, + ggml_element_size(kv_self.k)*n_state/n_head, + ggml_element_size(kv_self.k)*n_state*n_ctx*il); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_state/n_head, n_head, + n_ctx*ggml_element_size(kv_self.v), + n_ctx*ggml_element_size(kv_self.v)*n_state/n_head, + 
il*n_ctx*ggml_element_size(kv_self.v)*n_state); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N)); + } + + // projection + { + cur = ggml_mul_mat(ctx0, + layer.attn_ln_1_w, + cur); + + cur = ggml_add(ctx0, + cur, + layer.attn_ln_1_b); + } + + // add the input + struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL); + + // norm + { + cur = ggml_norm(ctx0, inpCA, hparams.eps); // note: we use inpCA here + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.cross_attn_ln_0_w), + layer.cross_attn_ln_0_b); + } + + // cross-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, + layer.cross_attn_q_w, + cur); + + Qcur = ggml_add(ctx0, + Qcur, + layer.cross_attn_q_b); + + Qcur = ggml_scale(ctx0, Qcur, KQscale); + + // Kcross is already scaled + struct ggml_tensor * Kcross = + ggml_view_3d(ctx0, wstate.kv_cross.k, + n_state/n_head, M, n_head, + ggml_element_size(wstate.kv_cross.k)*n_state, + ggml_element_size(wstate.kv_cross.k)*n_state/n_head, + ggml_element_size(wstate.kv_cross.k)*n_state*M*il); + + //struct ggml_tensor * Vcross = + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state), + // n_state/n_head, n_head, M); + + //struct ggml_tensor * V_trans = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, Vcross, 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head)); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, wstate.kv_cross.v, + M, n_state/n_head, n_head, + M*ggml_element_size(wstate.kv_cross.v), + M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head, + il*M*ggml_element_size(wstate.kv_cross.v)*n_state); + + // ------ + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q); + + //struct ggml_tensor * KQ_scaled = + // ggml_scale(ctx0, + // KQ, + // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) + // ); + + // no masking for cross-attention + //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_state, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N)); + } + + // projection + { + cur = ggml_mul_mat(ctx0, + layer.cross_attn_ln_1_w, + cur); + + cur = ggml_add(ctx0, + cur, + layer.cross_attn_ln_1_b); + } + + // add the input + cur = ggml_add(ctx0, cur, inpCA); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.mlp_ln_w), + layer.mlp_ln_b); + } + + // fully connected + cur = ggml_mul_mat(ctx0, + layer.mlp_0_w, + cur); + + cur = ggml_add(ctx0, + cur, + layer.mlp_0_b); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + cur = ggml_mul_mat(ctx0, + layer.mlp_1_w, + cur); + + cur = ggml_add(ctx0, + cur, + layer.mlp_1_b); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, 
hparams.eps); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.d_ln_w), + model.d_ln_b); + } + + // compute logits only for the last token + // comment this line to compute logits for all N tokens + // might be useful in the future + cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]); + + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur); + + ggml_build_forward_expand(gf, logits); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the decoder +// +// given text prompt + audio features -> computes the logits for the next token +// +// - model: the model +// - n_threads: number of threads to use +// - tokens: text prompt +// - n_tokens: number of tokens in the prompt +// - n_past: number of past tokens to prefix the prompt with +// +static bool whisper_decode_internal( + whisper_context & wctx, + whisper_state & wstate, + whisper_decoder & decoder, + const whisper_token * tokens, + const int n_tokens, + const int n_past, + const int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + const auto & model = wctx.model; + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + auto & logits_out = wstate.logits; + + struct ggml_tensor * logits; + + // decoder + { + auto & alloc = wstate.alloc_decode.alloc; + + ggml_allocr_reset(alloc); + + ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, decoder, tokens, n_tokens, n_past); + + ggml_allocr_alloc_graph(alloc, gf); + + logits = gf->nodes[gf->n_nodes - 1]; + +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif + } + + // extract logits for all N tokens + //logits_out.resize(n_tokens*n_vocab); + //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_tokens*n_vocab); + + // extract logits only for the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab); + + if (n_tokens > 1) { + //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__, + // ggml_used_mem(ctx0)/1024.0/1024.0, + // wstate.get_buf_max_mem(0)/1024.0/1024.0, + // wstate.get_buf_max_mem(1)/1024.0/1024.0, + // wstate.get_buf_max_mem(2)/1024.0/1024.0, + // wstate.get_buf_max_mem(3)/1024.0/1024.0); + } + + if (n_tokens == 1) { + wstate.t_decode_us += ggml_time_us() - t_start_us; + wstate.n_decode++; + } else { + wstate.t_prompt_us += ggml_time_us() - t_start_us; + wstate.n_prompt++; + } + + return true; +} + + +// 500 -> 00:05.000 +// 6000 -> 01:00.000 +static std::string to_timestamp(int64_t t, bool comma = false) { + int64_t msec = t * 10; + int64_t hr = msec / (1000 * 60 * 60); + msec = msec - hr * (1000 * 60 * 60); + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; + + char buf[32]; + snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec); + + return std::string(buf); +} + +#define SIN_COS_N_COUNT WHISPER_N_FFT +static float sin_vals[SIN_COS_N_COUNT]; +static float cos_vals[SIN_COS_N_COUNT]; + +// In FFT, we frequently use sine and cosine operations with the same values. +// We can use precalculated values to speed up the process. 
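The table can be used without interpolation because every angle an N-point DFT needs, theta = 2*pi*k*n/N, lands exactly on a table slot whenever N divides the table size. A standalone sketch of the index mapping that dft() and fft() below rely on (the 400-entry table mirrors SIN_COS_N_COUNT == WHISPER_N_FFT; the names and values are illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int table_n = 400;          // mirrors SIN_COS_N_COUNT
        const int N = 100, k = 3, n = 7;  // one term of an N-point DFT; N divides table_n
        const int idx = (k * n * (table_n / N)) % table_n;
        // both print the same angle: 2*pi*84/400 == 2*pi*21/100
        std::printf("table theta  = %f\n", 2.0 * M_PI * idx / table_n);
        std::printf("direct theta = %f\n", 2.0 * M_PI * k * n / N);
        return 0;
    }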
+static void fill_sin_cos_table() { + static bool is_filled = false; + if (is_filled) return; + for (int i = 0; i < SIN_COS_N_COUNT; i++) { + double theta = (2*M_PI*i)/SIN_COS_N_COUNT; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } + is_filled = true; +} + +// naive Discrete Fourier Transform +// input is real-valued +// output is complex-valued +static void dft(const std::vector<float> & in, std::vector<float> & out) { + int N = in.size(); + + out.resize(N*2); + const int sin_cos_step = SIN_COS_N_COUNT / N; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N + re += in[n]*cos_vals[idx]; // cos(t) + im -= in[n]*sin_vals[idx]; // sin(t) + } + + out[k*2 + 0] = re; + out[k*2 + 1] = im; + } +} + +// Cooley-Tukey FFT +// poor man's implementation - use something better +// input is real-valued +// output is complex-valued +static void fft(const std::vector<float> & in, std::vector<float> & out) { + out.resize(in.size()*2); + + int N = in.size(); + + if (N == 1) { + out[0] = in[0]; + out[1] = 0; + return; + } + + if (N%2 == 1) { + dft(in, out); + return; + } + + std::vector<float> even; + std::vector<float> odd; + + even.reserve(N/2); + odd.reserve(N/2); + + for (int i = 0; i < N; i++) { + if (i % 2 == 0) { + even.push_back(in[i]); + } else { + odd.push_back(in[i]); + } + } + + std::vector<float> even_fft; + std::vector<float> odd_fft; + + fft(even, even_fft); + fft(odd, odd_fft); + + const int sin_cos_step = SIN_COS_N_COUNT / N; + for (int k = 0; k < N/2; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = cos_vals[idx]; // cos(t) + float im = -sin_vals[idx]; // sin(t) + + float re_odd = odd_fft[2*k + 0]; + float im_odd = odd_fft[2*k + 1]; + + out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; + out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; + + out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; + out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; + } +} + +static bool hann_window(int length, bool periodic, std::vector<float> & output) { + if (output.size() < static_cast<size_t>(length)) { + output.resize(length); + } + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset))); + } + + return true; +} + +static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const std::vector<float> & samples, + int n_samples, int frame_size, int frame_step, int n_threads, + const whisper_filters & filters, whisper_mel & mel) { + std::vector<float> fft_in(frame_size, 0.0); + std::vector<float> fft_out(2 * frame_step); + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + int n_fft = 1 + (frame_size / 2); + int i = ith; + + // calculate FFT only when fft_in is not all zero + for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hanning window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(fft_in, fft_out); + + // Calculate modulus^2 of complex numbers + // Using pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) instead causes an inference quality problem? Interesting.
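The loop that follows collapses the interleaved complex FFT output (re, im) in place into the power spectrum |X[j]|^2 = re^2 + im^2. Because the input is real-valued the spectrum is conjugate-symmetric, so only the first n_fft = 1 + frame_size/2 bins carry information, and those are the only bins the mel projection below reads.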
+ for (int j = 0; j < frame_size; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; + } + + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } + + sum = log10(std::max(sum, 1e-10)); + + mel.data[j * mel.n_len + i] = sum; + } + } + + // otherwise, fft_out is all zero + double sum = log10(1e-10); + for (; i < mel.n_len; i += n_threads) { + for (int j = 0; j < mel.n_mel; j++) { + mel.data[j * mel.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + whisper_state & wstate, + const float * samples, + const int n_samples, + const int /*sample_rate*/, + const int frame_size, + const int frame_step, + const int n_mel, + const int n_threads, + const whisper_filters & filters, + const bool debug, + whisper_mel & mel) { + const int64_t t_start_us = ggml_time_us(); + + // Hann window (use cosf to eliminate differences from the reference implementation) + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + std::vector<float> hann; + hann_window(frame_size, true, hann); + + + // Calculate the length of padding + int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; + int64_t stage_2_pad = frame_size / 2; + + // Initialize a vector and copy data from the C array into it.
+ std::vector<float> samples_padded; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + + // pad 30 seconds of zeros at the end of audio (480,000 samples) and 200 more zeros at the end (zero-fill, not a reflective pad) + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + + // reflective pad 200 samples at the beginning of audio + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + + mel.n_mel = n_mel; + // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 + // Calculate number of frames + remove the last frame + mel.n_len = (samples_padded.size() - frame_size) / frame_step; + // Calculate semi-padded sample length to ensure compatibility + mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; + mel.data.resize(mel.n_mel * mel.n_len); + + + { + std::vector<std::thread> workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread( + log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded, + n_samples + stage_2_pad, frame_size, frame_step, n_threads, + std::cref(filters), std::ref(mel)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); + + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] > mmax) { + mmax = mel.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] < mmax) { + mel.data[i] = mmax; + } + + mel.data[i] = (mel.data[i] + 4.0)/4.0; + } + + wstate.t_mel_us += ggml_time_us() - t_start_us; + + // Dump log_mel_spectrogram + if (debug) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.data.size() - 1; i++) { + outFile << mel.data[i] << ", "; + } + outFile << mel.data[mel.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +// split text into tokens +// +// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 +// +// Regex (Python): +// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +// +// Regex (C++): +// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" +// +static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, const std::string & text) { + std::vector<std::string> words; + + // first split the text into words + { + std::string str = text; + std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; + + std::regex re(pat); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } + } + + // find the longest tokens that form the words: + std::vector<whisper_vocab::id> tokens; + for (const auto & word : words) { + if (word.empty()) continue; + + int i = 0; + int n = word.size(); + while (i < n) { + int j = n; + bool found = false; + while (j > i) { + auto sub = word.substr(i, j-i); + auto it = vocab.token_to_id.find(sub); + if (it != vocab.token_to_id.end()) { + tokens.push_back(it->second); + i = j; + found = true; + break; + } + --j; + } + if (!found) { + log("unknown token\n"); + ++i; +
} + } + } + + return tokens; +} + +// +// interface implementation +// + +#ifdef WHISPER_USE_COREML +// replace .bin with -encoder.mlmodelc +static std::string whisper_get_coreml_path_encoder(std::string path_bin) { + auto pos = path_bin.rfind('.'); + if (pos != std::string::npos) { + path_bin = path_bin.substr(0, pos); + } + + // match "-qx_x" + pos = path_bin.rfind('-'); + if (pos != std::string::npos) { + auto sub = path_bin.substr(pos); + if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') { + path_bin = path_bin.substr(0, pos); + } + } + + path_bin += "-encoder.mlmodelc"; + + return path_bin; +} +#endif + +#ifdef WHISPER_USE_OPENVINO +// replace .bin with-encoder-openvino.xml +static std::string whisper_openvino_get_path_encoder(std::string path_bin) { + auto pos = path_bin.rfind('.'); + if (pos != std::string::npos) { + path_bin = path_bin.substr(0, pos); + } + + path_bin += "-encoder-openvino.xml"; + + return path_bin; +} + +static std::string whisper_openvino_get_path_cache(std::string path_bin) { + auto pos = path_bin.rfind('.'); + if (pos != std::string::npos) { + path_bin = path_bin.substr(0, pos); + } + + path_bin += "-encoder-openvino-cache"; + + return path_bin; +} +#endif + +struct whisper_state * whisper_init_state(whisper_context * ctx) { + fill_sin_cos_table(); + whisper_state * state = new whisper_state; + + if (!kv_cache_init(ctx->model.hparams, state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) { + log("%s: kv_cache_init() failed for self-attention cache\n", __func__); + delete state; + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v); + log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + + if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) { + log("%s: kv_cache_init() failed for cross-attention cache\n", __func__); + delete state; + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v); + log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + +#ifdef WHISPER_USE_COREML + const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model); + + log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str()); + log("%s: first run on a device may take a while ...\n", __func__); + + state->ctx_coreml = whisper_coreml_init(path_coreml.c_str()); + if (!state->ctx_coreml) { + log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str()); +#ifndef WHISPER_COREML_ALLOW_FALLBACK + delete state; + return nullptr; +#endif + } else { + log("%s: Core ML model loaded\n", __func__); + } +#endif + + state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx); + + state->logits_id.reserve(ctx->model.hparams.n_vocab); + + // TAGS: WHISPER_DECODER_INIT + state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx); + + state->decoders[0].probs.reserve (ctx->vocab.n_vocab); + state->decoders[0].logits.reserve (ctx->vocab.n_vocab); + state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab); + + // conv allocator + { + whisper_allocr_graph_init(state->alloc_conv, + [&]() { + return whisper_build_graph_conv(*ctx, *state, 0); + }); + + log("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1024.0 / 1024.0); + } + + // encoder allocator + if (!whisper_encode_external(*state)) { + 
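// the ggml encoder graph is only needed when no external encoder (Core ML / OpenVINO) + // is in use - whisper_encode_external() reports whether one is loaded, so the + // encode compute buffer below is skipped in that case +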
whisper_allocr_graph_init(state->alloc_encode, + [&]() { + return whisper_build_graph_encoder(*ctx, *state); + }); + + log("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1024.0 / 1024.0); + } + + // cross allocator + { + whisper_allocr_graph_init(state->alloc_cross, + [&]() { + return whisper_build_graph_cross(*ctx, *state); + }); + + log("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1024.0 / 1024.0); + } + + // decoder allocator + { + whisper_allocr_graph_init(state->alloc_decode, + [&]() { + const auto & hparams = ctx->model.hparams; + + // TODO: make sure this is the worst-case scenario + const int n_tokens = hparams.n_text_ctx; + const int n_past = 0; + + return whisper_build_graph_decoder(*ctx, *state, state->decoders[0], nullptr, n_tokens, n_past); + }); + + log("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1024.0 / 1024.0); + } + +#ifdef GGML_USE_METAL + state->ctx_metal = ggml_metal_init(1); + if (!state->ctx_metal) { + log("%s: ggml_metal_init() failed\n", __func__); + delete state; + return nullptr; + } + + log("%s: Metal context initialized\n", __func__); + + // this allocates all Metal resources and memory buffers + + void * data_ptr = NULL; + size_t data_size = 0; + + // TODO: add mmap support + //if (params.use_mmap) { + // data_ptr = ctx->model.mapping->addr; + // data_size = ctx->model.mapping->size; + //} else { + // data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + // data_size = ggml_get_mem_size (ctx->model.ctx); + //} + + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define WHISPER_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + delete state; \ + return nullptr; \ + } + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, 
"kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0)); +#undef WHISPER_METAL_CHECK_BUF +#endif + + state->rng = std::mt19937(0); + + return state; +} + +int whisper_ctx_init_openvino_encoder( + struct whisper_context * ctx, + const char * model_path, + const char * device, + const char * cache_dir) { +#ifndef WHISPER_USE_OPENVINO + (void)(ctx); + (void)(model_path); + (void)(device); + (void)(cache_dir); + + return 1; +#else + if (!model_path && ctx->path_model.empty()) { + log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__); + return 1; + } + + std::string path_encoder; + if (!model_path) { + //if model_path is not set, attempt to find it in the same directory as ggml-.bin model + path_encoder = whisper_openvino_get_path_encoder(ctx->path_model); + } else { + path_encoder = model_path; + } + + std::string path_cache; + if (!cache_dir) { + //if cache_dir is not set, set it as a dir residing next to ggml-.bin + path_cache = whisper_openvino_get_path_cache(ctx->path_model); + } else { + path_cache = cache_dir; + } + + log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str()); + log("%s: first run on a device may take a while ...\n", __func__); + + ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str()); + if (!ctx->state->ctx_openvino) { + log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str()); + return 1; + } else { + log("%s: OpenVINO model loaded\n", __func__); + } + + return 0; +#endif +} + +struct whisper_context * whisper_init_from_file_no_state(const char * path_model) { + log("%s: loading model from '%s'\n", __func__, path_model); + + auto fin = std::ifstream(path_model, std::ios::binary); + if (!fin) { + log("%s: failed to open '%s'\n", __func__, path_model); + return nullptr; + } + + whisper_model_loader loader = {}; + + loader.context = &fin; + + loader.read = [](void * ctx, void * output, size_t read_size) { + std::ifstream * fin = (std::ifstream*)ctx; + fin->read((char *)output, read_size); + return read_size; + }; + + loader.eof = [](void * ctx) { + std::ifstream * fin = (std::ifstream*)ctx; + return fin->eof(); + }; + + loader.close = [](void * ctx) { + std::ifstream * fin = (std::ifstream*)ctx; + fin->close(); + }; + + auto ctx = whisper_init_no_state(&loader); + + if (ctx) { + ctx->path_model = path_model; + } + + return ctx; +} + +struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) { + struct buf_context { + uint8_t* buffer; + size_t size; + size_t current_offset; + }; + + buf_context ctx = { reinterpret_cast(buffer), buffer_size, 0 }; + + log("%s: loading model from buffer\n", __func__); + + whisper_model_loader loader = {}; + + loader.context = &ctx; + + loader.read = [](void * ctx, void * output, size_t read_size) { + buf_context * buf = reinterpret_cast(ctx); + + size_t size_to_copy = buf->current_offset + read_size < buf->size ? 
read_size : buf->size - buf->current_offset; + + memcpy(output, buf->buffer + buf->current_offset, size_to_copy); + buf->current_offset += size_to_copy; + + return size_to_copy; + }; + + loader.eof = [](void * ctx) { + buf_context * buf = reinterpret_cast(ctx); + + return buf->current_offset >= buf->size; + }; + + loader.close = [](void * /*ctx*/) { }; + + return whisper_init_no_state(&loader); +} + +struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) { + ggml_time_init(); + + whisper_context * ctx = new whisper_context; + + if (!whisper_model_load(loader, *ctx)) { + loader->close(loader->context); + log("%s: failed to load model\n", __func__); + delete ctx; + return nullptr; + } + + loader->close(loader->context); + + return ctx; +} + +struct whisper_context * whisper_init_from_file(const char * path_model) { + whisper_context * ctx = whisper_init_from_file_no_state(path_model); + if (!ctx) { + return nullptr; + } + + ctx->state = whisper_init_state(ctx); + if (!ctx->state) { + whisper_free(ctx); + return nullptr; + } + + return ctx; +} + +struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) { + whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size); + if (!ctx) { + return nullptr; + } + + ctx->state = whisper_init_state(ctx); + if (!ctx->state) { + whisper_free(ctx); + return nullptr; + } + + return ctx; +} + +struct whisper_context * whisper_init(struct whisper_model_loader * loader) { + whisper_context * ctx = whisper_init_no_state(loader); + if (!ctx) { + return nullptr; + } + + ctx->state = whisper_init_state(ctx); + if (!ctx->state) { + whisper_free(ctx); + return nullptr; + } + + return ctx; +} + +void whisper_free_state(struct whisper_state * state) +{ + if (state) { + kv_cache_free(state->kv_cross); + + for (int i = 0; i < WHISPER_MAX_DECODERS; ++i) { + kv_cache_free(state->decoders[i].kv_self); + } + +#ifdef WHISPER_USE_COREML + if (state->ctx_coreml != nullptr) { + whisper_coreml_free(state->ctx_coreml); + state->ctx_coreml = nullptr; + } +#endif + +#ifdef GGML_USE_METAL + if (state->ctx_metal) { + ggml_metal_free(state->ctx_metal); + state->ctx_metal = nullptr; + } +#endif + +#ifdef WHISPER_USE_OPENVINO + if (state->ctx_openvino != nullptr) { + whisper_openvino_free(state->ctx_openvino); + state->ctx_openvino = nullptr; + } +#endif + + whisper_allocr_free(state->alloc_conv); + whisper_allocr_free(state->alloc_decode); + whisper_allocr_free(state->alloc_cross); + whisper_allocr_free(state->alloc_encode); + + delete state; + } +} + +void whisper_free(struct whisper_context * ctx) { + if (ctx) { + if (ctx->model.ctx) { + ggml_free(ctx->model.ctx); + } + if (ctx->model.buf) { + delete ctx->model.buf; + } + + whisper_free_state(ctx->state); + + delete ctx; + } +} + +void whisper_free_params(struct whisper_full_params * params) { + if (params) { + delete params; + } +} + +int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) { + log("%s: failed to compute mel spectrogram\n", __func__); + return -1; + } + + return 0; +} + +int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads); +} + +// same 
as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) +int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) { + log("%s: failed to compute mel spectrogram\n", __func__); + return -1; + } + + return 0; +} + +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) +int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads); +} + +// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2 +// TODO + +// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2 +// TODO + +// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2 +// TODO + +int whisper_set_mel_with_state( + struct whisper_context * /*ctx*/, + struct whisper_state * state, + const float * data, + int n_len, + int n_mel) { + if (n_mel != WHISPER_N_MEL) { + log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL); + return -1; + } + + state->mel.n_len = n_len; + state->mel.n_len_org = n_len; + state->mel.n_mel = n_mel; + + state->mel.data.resize(n_len*n_mel); + memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float)); + + return 0; +} + +int whisper_set_mel( + struct whisper_context * ctx, + const float * data, + int n_len, + int n_mel) { + return whisper_set_mel_with_state(ctx, ctx->state, data, n_len, n_mel); +} + +int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) { + if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) { + log("%s: failed to eval\n", __func__); + return -1; + } + + return 0; +} + +int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) { + if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) { + log("%s: failed to eval\n", __func__); + return -1; + } + + return 0; +} + +int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) { + const int selected_decoder_id = 0; + + if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) { + log("%s: failed to eval\n", __func__); + return 1; + } + + return 0; +} + +int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) { + // TODO: add selected_decoder_id to state + const int selected_decoder_id = 0; + + if (ctx->state == nullptr) { + log("%s: ERROR state was not loaded.\n", __func__); + return false; + } + + if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) { + log("%s: failed to eval\n", __func__); + return 1; + } + + return 0; +} + +int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) { + const auto res = tokenize(ctx->vocab, text); + + if (n_max_tokens < (int) res.size()) { + log("%s: too many resulting tokens: %d (max %d)\n", 
__func__, (int) res.size(), n_max_tokens); + return -1; + } + + for (int i = 0; i < (int) res.size(); i++) { + tokens[i] = res[i]; + } + + return res.size(); +} + +int whisper_lang_max_id() { + auto max_id = 0; + for (const auto & kv : g_lang) { + max_id = std::max(max_id, kv.second.first); + } + + return max_id; +} + +int whisper_lang_id(const char * lang) { + if (!g_lang.count(lang)) { + for (const auto & kv : g_lang) { + if (kv.second.second == lang) { + return kv.second.first; + } + } + + log("%s: unknown language '%s'\n", __func__, lang); + return -1; + } + return g_lang.at(lang).first; +} + +const char * whisper_lang_str(int id) { + for (const auto & kv : g_lang) { + if (kv.second.first == id) { + return kv.first.c_str(); + } + } + + log("%s: unknown language id %d\n", __func__, id); + return nullptr; +} + +int whisper_lang_auto_detect_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + int offset_ms, + int n_threads, + float * lang_probs) { + const int seek = offset_ms/10; + + if (seek < 0) { + log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms); + return -1; + } + + if (seek >= state->mel.n_len_org) { + log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10); + return -2; + } + + // run the encoder + if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) { + log("%s: failed to encode\n", __func__); + return -6; + } + + const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) }; + + if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) { + log("%s: failed to decode\n", __func__); + return -7; + } + + auto & logits_id = state->logits_id; + logits_id.clear(); + + for (const auto & kv : g_lang) { + const auto token_lang = whisper_token_lang(ctx, kv.second.first); + logits_id.emplace_back(state->logits[token_lang], kv.second.first); + } + + // sort descending + { + using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type; + std::sort(logits_id.begin(), logits_id.end(), [](const pair_type & a, const pair_type & b) { + return a.first > b.first; + }); + } + + // softmax + { + const auto max = logits_id[0].first; + + double sum = 0.0f; + for (auto & kv : logits_id) { + kv.first = exp(kv.first - max); + sum += kv.first; + } + + for (auto & kv : logits_id) { + kv.first /= sum; + } + } + + { + for (const auto & prob : logits_id) { + if (lang_probs) { + lang_probs[prob.second] = prob.first; + } + + //printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first); + } + } + + return logits_id[0].second; +} + +int whisper_lang_auto_detect( + struct whisper_context * ctx, + int offset_ms, + int n_threads, + float * lang_probs) { + return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs); +} + +int whisper_model_n_vocab(struct whisper_context * ctx) { + return ctx->model.hparams.n_vocab; +} + +int whisper_model_n_audio_ctx(struct whisper_context * ctx) { + return ctx->model.hparams.n_audio_ctx; +} + +int whisper_model_n_audio_state(struct whisper_context * ctx) { + return ctx->model.hparams.n_audio_state; +} + +int whisper_model_n_audio_head(struct whisper_context * ctx) { + return ctx->model.hparams.n_audio_head; +} + +int whisper_model_n_audio_layer(struct whisper_context * ctx) { + return ctx->model.hparams.n_audio_layer; +} + +int whisper_model_n_text_ctx(struct whisper_context * ctx) { + return ctx->model.hparams.n_text_ctx; +} + +int whisper_model_n_text_state(struct
whisper_context * ctx) { + return ctx->model.hparams.n_text_state; +} + +int whisper_model_n_text_head(struct whisper_context * ctx) { + return ctx->model.hparams.n_text_head; +} + +int whisper_model_n_text_layer(struct whisper_context * ctx) { + return ctx->model.hparams.n_text_layer; +} + +int whisper_model_n_mels(struct whisper_context * ctx) { + return ctx->model.hparams.n_mels; +} + +int whisper_model_ftype(struct whisper_context * ctx) { + return ctx->model.hparams.ftype; +} + +int whisper_model_type(struct whisper_context * ctx) { + return ctx->model.type; +} + +const char *whisper_model_type_readable(struct whisper_context * ctx) { + switch (ctx->model.type) { + case e_model::MODEL_TINY: + return "tiny"; + case e_model::MODEL_BASE: + return "base"; + case e_model::MODEL_SMALL: + return "small"; + case e_model::MODEL_MEDIUM: + return "medium"; + case e_model::MODEL_LARGE: + return "large"; + default: + return "unknown"; + } +} + +int whisper_n_len_from_state(struct whisper_state * state) { + return state->mel.n_len_org; +} + +int whisper_n_len(struct whisper_context * ctx) { + return ctx->state->mel.n_len_org; +} + +int whisper_n_vocab(struct whisper_context * ctx) { + return ctx->vocab.n_vocab; +} + +int whisper_n_text_ctx(struct whisper_context * ctx) { + return ctx->model.hparams.n_text_ctx; +} + +int whisper_n_audio_ctx(struct whisper_context * ctx) { + return ctx->model.hparams.n_audio_ctx; +} + +int whisper_is_multilingual(struct whisper_context * ctx) { + return ctx->vocab.is_multilingual() ? 1 : 0; +} + +float * whisper_get_logits(struct whisper_context * ctx) { + return ctx->state->logits.data(); +} + +float * whisper_get_logits_from_state(struct whisper_state * state) { + return state->logits.data(); +} + +const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token) { + return ctx->vocab.id_to_token.at(token).c_str(); +} + +whisper_token whisper_token_eot(struct whisper_context * ctx) { + return ctx->vocab.token_eot; +} + +whisper_token whisper_token_sot(struct whisper_context * ctx) { + return ctx->vocab.token_sot; +} + +whisper_token whisper_token_solm(struct whisper_context * ctx) { + return ctx->vocab.token_solm; +} + +whisper_token whisper_token_prev(struct whisper_context * ctx) { + return ctx->vocab.token_prev; +} + +whisper_token whisper_token_nosp(struct whisper_context * ctx) { + return ctx->vocab.token_nosp; +} + +whisper_token whisper_token_not(struct whisper_context * ctx) { + return ctx->vocab.token_not; +} + +whisper_token whisper_token_beg(struct whisper_context * ctx) { + return ctx->vocab.token_beg; +} + +whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) { + return whisper_token_sot(ctx) + 1 + lang_id; +} + +whisper_token whisper_token_translate(struct whisper_context * ctx) { + return ctx->vocab.token_translate; +} + +whisper_token whisper_token_transcribe(struct whisper_context * ctx) { + return ctx->vocab.token_transcribe; +} + +void whisper_print_timings(struct whisper_context * ctx) { + const int64_t t_end_us = ggml_time_us(); + + log("\n"); + log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f); + if (ctx->state != nullptr) { + + const int32_t n_sample = std::max(1, ctx->state->n_sample); + const int32_t n_encode = std::max(1, ctx->state->n_encode); + const int32_t n_decode = std::max(1, ctx->state->n_decode); + const int32_t n_prompt = std::max(1, ctx->state->n_prompt); + + log("%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h); + log("%s: mel 
time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f); + log("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample); + log("%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode); + log("%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode); + log("%s: prompt time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_prompt_us, n_prompt, 1e-3f * ctx->state->t_prompt_us / n_prompt); + } + log("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); +} + +void whisper_reset_timings(struct whisper_context * ctx) { + if (ctx->state != nullptr) { + ctx->state->t_sample_us = 0; + ctx->state->t_encode_us = 0; + ctx->state->t_decode_us = 0; + ctx->state->t_prompt_us = 0; + ctx->state->n_sample = 0; + ctx->state->n_encode = 0; + ctx->state->n_decode = 0; + ctx->state->n_prompt = 0; + } +} + +static int whisper_has_coreml(void) { +#ifdef WHISPER_USE_COREML + return 1; +#else + return 0; +#endif +} + +static int whisper_has_openvino(void) { +#ifdef WHISPER_USE_OPENVINO + return 1; +#else + return 0; +#endif +} + +const char * whisper_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "METAL = " + std::to_string(ggml_cpu_has_metal()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "COREML = " + std::to_string(whisper_has_coreml()) + " | "; + s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | "; + + return s.c_str(); +} + +//////////////////////////////////////////////////////////////////////////// + +struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) { + struct whisper_full_params params = whisper_full_default_params(strategy); + + struct whisper_full_params* result = new whisper_full_params(); + *result = params; + return result; +} + +struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) { + struct whisper_full_params result = { + /*.strategy =*/ strategy, + + /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()), + /*.n_max_text_ctx =*/ 16384, + /*.offset_ms =*/ 0, + /*.duration_ms =*/ 0, + + /*.translate =*/ false, + /*.no_context =*/ true, + /*.single_segment =*/ false, + /*.print_special =*/ false, + /*.print_progress =*/ true, + /*.print_realtime =*/ false, + /*.print_timestamps =*/ true, + + /*.token_timestamps =*/ false, + /*.thold_pt =*/ 0.01f, + /*.thold_ptsum =*/ 0.01f, + 
/*.max_len =*/ 0, + /*.split_on_word =*/ false, + /*.max_tokens =*/ 0, + + /*.speed_up =*/ false, + /*.debug_mode =*/ false, + /*.audio_ctx =*/ 0, + + /*.tdrz_enable =*/ false, + + /*.initial_prompt =*/ nullptr, + /*.prompt_tokens =*/ nullptr, + /*.prompt_n_tokens =*/ 0, + + /*.language =*/ "en", + /*.detect_language =*/ false, + + /*.suppress_blank =*/ true, + /*.suppress_non_speech_tokens =*/ false, + + /*.temperature =*/ 0.0f, + /*.max_initial_ts =*/ 1.0f, + /*.length_penalty =*/ -1.0f, + + /*.temperature_inc =*/ 0.4f, + /*.entropy_thold =*/ 2.4f, + /*.logprob_thold =*/ -1.0f, + /*.no_speech_thold =*/ 0.6f, + + /*.greedy =*/ { + /*.best_of =*/ -1, + }, + + /*.beam_search =*/ { + /*.beam_size =*/ -1, + + /*.patience =*/ -1.0f, + }, + + /*.new_segment_callback =*/ nullptr, + /*.new_segment_callback_user_data =*/ nullptr, + + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, + + /*.encoder_begin_callback =*/ nullptr, + /*.encoder_begin_callback_user_data =*/ nullptr, + + /*.logits_filter_callback =*/ nullptr, + /*.logits_filter_callback_user_data =*/ nullptr, + }; + + switch (strategy) { + case WHISPER_SAMPLING_GREEDY: + { + result.greedy = { + /*.best_of =*/ 2, // TODO: increase to 5 when we speed-up batch decoding + }; + } break; + case WHISPER_SAMPLING_BEAM_SEARCH: + { + result.beam_search = { + /*.beam_size =*/ 2, // TODO: increase to 5 when we speed-up batch decoding + + /*.patience =*/ -1.0f, + }; + } break; + } + + return result; +} + +// forward declarations +static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window); +static void whisper_exp_compute_token_level_timestamps( + struct whisper_context & ctx, + struct whisper_state & state, + int i_segment, + float thold_pt, + float thold_ptsum); + +static inline bool should_split_on_word(const char * txt, bool split_on_word) { + if (!split_on_word) return true; + + return txt[0] == ' '; +} + +// wrap the last segment to max_len characters +// returns the number of new segments +static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_state & state, int max_len, bool split_on_word) { + auto segment = state.result_all.back(); + + int res = 1; + int acc = 0; + + std::string text; + + for (int i = 0; i < (int) segment.tokens.size(); i++) { + const auto & token = segment.tokens[i]; + if (token.id >= whisper_token_eot(&ctx)) { + continue; + } + + const auto txt = whisper_token_to_str(&ctx, token.id); + const int cur = strlen(txt); + + if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) { + state.result_all.back().text = std::move(text); + state.result_all.back().t1 = token.t0; + state.result_all.back().tokens.resize(i); + state.result_all.back().speaker_turn_next = false; + + state.result_all.push_back({}); + state.result_all.back().t0 = token.t0; + state.result_all.back().t1 = segment.t1; + + // add tokens [i, end] to the new segment + state.result_all.back().tokens.insert( + state.result_all.back().tokens.end(), + segment.tokens.begin() + i, + segment.tokens.end()); + + state.result_all.back().speaker_turn_next = segment.speaker_turn_next; + + acc = 0; + text = ""; + + segment = state.result_all.back(); + i = -1; + + res++; + } else { + acc += cur; + text += txt; + } + } + + state.result_all.back().text = std::move(text); + + return res; +} + +static const std::vector<std::string> non_speech_tokens = { + "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^", + "_", "`", "{", "|", "}", "~", "「", "」",
"『", "』", "<<", ">>", "<<<", ">>>", "--", + "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪", + "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯" +}; + +// process the logits for the selected decoder +// - applies logit filters +// - computes logprobs and probs +static void whisper_process_logits( + struct whisper_context & ctx, + struct whisper_state & state, + const struct whisper_full_params params, + struct whisper_decoder & decoder, + float temperature) { + const auto & vocab = ctx.vocab; + const auto & tokens_cur = decoder.sequence.tokens; + + const bool is_initial = tokens_cur.size() == 0; + const int n_logits = vocab.id_to_token.size(); + + WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab); + + // extract the logits for the last token + // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly + auto & probs = decoder.probs; + auto & logits = decoder.logits; + auto & logprobs = decoder.logprobs; + { + logits.resize(n_logits); + memcpy(logits.data(), state.logits.data() + (state.logits.size() - n_logits), n_logits*sizeof(float)); + + if (temperature > 0.0f) { + for (int i = 0; i < n_logits; i++) { + logits[i] /= temperature; + } + } + + // will be populated a bit later + probs.resize(n_logits); + logprobs.resize(n_logits); + } + + // apply logit filters here + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L480-L493 + { + // suppress blank + // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L388-L390 + if (params.suppress_blank) { + if (is_initial) { + logits[vocab.token_eot] = -INFINITY; + logits[vocab.token_to_id.at(" ")] = -INFINITY; + } + } + + // suppress <|notimestamps|> token + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412 + logits[vocab.token_not] = -INFINITY; + + // suppress sot and nosp tokens + logits[vocab.token_sot] = -INFINITY; + logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now + + // [TDRZ] when tinydiarize is disabled, suppress solm token + if (params.tdrz_enable == false) { + logits[vocab.token_solm] = -INFINITY; + } + + // suppress task tokens + logits[vocab.token_translate] = -INFINITY; + logits[vocab.token_transcribe] = -INFINITY; + + if (params.logits_filter_callback) { + params.logits_filter_callback(&ctx, &state, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data); + } + + // suppress non-speech tokens + // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253 + if (params.suppress_non_speech_tokens) { + for (const std::string & token : non_speech_tokens) { + const std::string suppress_tokens[] = {token, " " + token}; + for (const std::string & suppress_token : suppress_tokens) { + if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) { + logits[vocab.token_to_id.at(suppress_token)] = -INFINITY; + } + } + } + + // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) { + logits[vocab.token_to_id.at(" -")] = -INFINITY; + } + if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) { + logits[vocab.token_to_id.at(" '")] = -INFINITY; + } + } + + // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly + // 
https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424 + { + const bool last_was_timestamp = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg; + const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg; + + //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp); + + if (last_was_timestamp) { + if (penultimate_was_timestamp) { + for (int i = vocab.token_beg; i < n_logits; ++i) { + logits[i] = -INFINITY; + } + } else { + for (int i = 0; i < vocab.token_eot; ++i) { + logits[i] = -INFINITY; + } + } + } + } + + // the initial timestamp cannot be larger than max_initial_ts + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429 + if (is_initial && params.max_initial_ts > 0.0f) { + const float precision = float(WHISPER_CHUNK_SIZE)/ctx.model.hparams.n_audio_ctx; + const int tid0 = std::round(params.max_initial_ts/precision); + + for (int i = vocab.token_beg + tid0 + 1; i < n_logits; ++i) { + logits[i] = -INFINITY; + } + } + + // condition timestamp tokens to be increasing + // ref: https://github.com/openai/whisper/pull/831#issuecomment-1385910556 + if (decoder.has_ts) { + const int tid0 = decoder.seek_delta/2; + + for (int i = vocab.token_beg; i < vocab.token_beg + tid0; ++i) { + logits[i] = -INFINITY; + } + } + + // populate the logprobs array (log_softmax) + { + const float logit_max = *std::max_element(logits.begin(), logits.end()); + float logsumexp = 0.0f; + for (int i = 0; i < n_logits; ++i) { + if (logits[i] > -INFINITY) { + logsumexp += expf(logits[i] - logit_max); + } + } + logsumexp = logf(logsumexp) + logit_max; + + for (int i = 0; i < n_logits; ++i) { + if (logits[i] > -INFINITY) { + logprobs[i] = logits[i] - logsumexp; + } else { + logprobs[i] = -INFINITY; + } + } + } + + // if sum of probability over timestamps is above any other token, sample timestamp + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437 + { + // logsumexp over timestamps + float timestamp_logprob = -INFINITY; + { + float logsumexp = 0.0f; + const float logprob_max = *std::max_element(logprobs.begin() + vocab.token_beg, logprobs.end()); + for (int i = vocab.token_beg; i < n_logits; ++i) { + if (logprobs[i] > -INFINITY) { + logsumexp += expf(logprobs[i] - logprob_max); + } + } + if (logsumexp > 0.0f) { + timestamp_logprob = logf(logsumexp) + logprob_max; + } + } + + const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg); + + //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob); + + if (timestamp_logprob > max_text_token_logprob) { + for (int i = 0; i < vocab.token_beg; ++i) { + logits[i] = -INFINITY; + logprobs[i] = -INFINITY; + } + } + } + } + + // compute probs + { + for (int i = 0; i < n_logits; ++i) { + if (logits[i] == -INFINITY) { + probs[i] = 0.0f; + } else { + probs[i] = expf(logprobs[i]); + } + } + } + +#if 0 + // print first 100 logits - token string : logit + for (int i = 0; i < 100; i++) { + const auto token = vocab.id_to_token.at(i); + const auto prob = probs[i]; + const auto logit = logits[i]; + const auto logprob = logprobs[i]; + printf("%s : prob=%9.5f logit=%9.5f logprob=%9.5f\n", token.c_str(), prob, logit, logprob); + } + + // "And", "and", " And", " and" + printf("logits[\"and\"] = 
%f\n", logits[vocab.token_to_id.at("and")]); + printf("logits[\"And\"] = %f\n", logits[vocab.token_to_id.at("And")]); + printf("logits[\" and\"] = %f\n", logits[vocab.token_to_id.at(" and")]); + printf("logits[\" And\"] = %f\n", logits[vocab.token_to_id.at(" And")]); + printf("logits[\" so\"] = %f\n", logits[vocab.token_to_id.at(" so")]); + + printf("logprobs[\"and\"] = %f\n", logprobs[vocab.token_to_id.at("and")]); + printf("logprobs[\"And\"] = %f\n", logprobs[vocab.token_to_id.at("And")]); + printf("logprobs[\" and\"] = %f\n", logprobs[vocab.token_to_id.at(" and")]); + printf("logprobs[\" And\"] = %f\n", logprobs[vocab.token_to_id.at(" And")]); + printf("logprobs[\" so\"] = %f\n", logprobs[vocab.token_to_id.at(" so")]); + + printf("probs[\"and\"] = %f\n", probs[vocab.token_to_id.at("and")]); + printf("probs[\"And\"] = %f\n", probs[vocab.token_to_id.at("And")]); + printf("probs[\" and\"] = %f\n", probs[vocab.token_to_id.at(" and")]); + printf("probs[\" And\"] = %f\n", probs[vocab.token_to_id.at(" And")]); + printf("probs[\" so\"] = %f\n", probs[vocab.token_to_id.at(" so")]); +#endif +} + +static whisper_token_data whisper_sample_token( + whisper_context & ctx, + whisper_state & state, + const whisper_decoder & decoder, + bool best) { + whisper_token_data result = { + 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f, + }; + + const auto & vocab = ctx.vocab; + + const auto & probs = decoder.probs; + const auto & logprobs = decoder.logprobs; + + const int n_logits = vocab.n_vocab; + + { + double sum_ts = 0.0; + double max_ts = 0.0; + + for (int i = vocab.token_beg; i < n_logits; i++) { + if (probs[i] == -INFINITY) { + continue; + } + + sum_ts += probs[i]; + if (max_ts < probs[i]) { + max_ts = probs[i]; + result.tid = i; + } + } + + result.pt = max_ts/(sum_ts + 1e-10); + result.ptsum = sum_ts; + } + + if (best) { + for (int i = 0; i < n_logits; ++i) { + if (result.p < probs[i]) { + result.id = i; + result.p = probs[i]; + result.plog = logprobs[i]; + } + } + } else { + std::discrete_distribution<> dist(probs.begin(), probs.end()); + + result.id = dist(state.rng); + result.p = probs[result.id]; + result.plog = logprobs[result.id]; + } + + if (result.id >= vocab.token_beg) { + result.tid = result.id; + result.pt = result.p; + } + + state.n_sample++; + + return result; +} + +static std::vector whisper_sample_token_topk( + whisper_context & ctx, + whisper_state & state, + const whisper_decoder & decoder, + int k) { + const auto & vocab = ctx.vocab; + + const auto & probs = decoder.probs; + const auto & logits = decoder.logits; + const auto & logprobs = decoder.logprobs; + + const int n_logits = vocab.n_vocab; + + auto & logits_id = state.logits_id; + + logits_id.resize(n_logits); + for (int i = 0; i < n_logits; ++i) { + logits_id[i].first = logits[i]; + logits_id[i].second = i; + } + + { + using pair_type = std::remove_reference::type::value_type; + std::partial_sort( + logits_id.begin(), + logits_id.begin() + k, logits_id.end(), + [](const pair_type & a, const pair_type & b) { + return a.first > b.first; + }); + } + + std::vector result; + result.reserve(k); + + whisper_token tid = vocab.token_beg; + + float pt = 0.0; + float ptsum = 0.0; + + { + double sum_ts = 0.0; + double max_ts = 0.0; + + for (int i = vocab.token_beg; i < n_logits; i++) { + if (probs[i] == -INFINITY) { + continue; + } + + sum_ts += probs[i]; + if (max_ts < probs[i]) { + max_ts = probs[i]; + tid = i; + } + } + + pt = max_ts/(sum_ts + 1e-10); + ptsum = sum_ts; + } + + for (int i = 0; i < k; ++i) { + const auto id = 
logits_id[i].second; + + result.push_back({ id, tid, probs[id], logprobs[id], pt, ptsum, -1, -1, 0.0f, }); + + if (result[i].id >= vocab.token_beg) { + result[i].tid = result[i].id; + result[i].pt = result[i].p; + } + } + + state.n_sample++; + + return result; +} + +// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L178-L192 +static void whisper_sequence_score( + const struct whisper_full_params & params, + whisper_sequence & sequence) { + if (sequence.result_len == 0) { + return; + } + + double result = 0.0f; + + for (int i = 0; i < sequence.result_len; ++i) { + result += sequence.tokens[i].plog; + } + + sequence.sum_logprobs = result; + sequence.avg_logprobs = result/sequence.result_len; + + double penalty = sequence.result_len; + + if (params.length_penalty > 0.0f) { + penalty = pow((5.0 + penalty)/6.0, params.length_penalty); + } + + sequence.score = result/penalty; + + // compute the entropy of the sequence of the last 32 tokens + { + const int n = 32; + + int cnt = 0; + double entropy = 0.0f; + + std::map<whisper_token, int> token_counts; + for (int i = std::max(0, sequence.result_len - n); i < sequence.result_len; ++i) { + token_counts[sequence.tokens[i].id]++; + cnt++; + } + + for (const auto & kv : token_counts) { + const auto p = kv.second/(double)cnt; + entropy -= p*log(p); + + //WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second); + } + + sequence.entropy = entropy; + } +} + +static bool whisper_kv_swap_fast( + std::vector<int> & view, + whisper_decoder src[], + std::vector<kv_buf> & kv_swap_bufs, + const int & n_decoders) { + WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders); + + // (decoder->buffer->decoder or decoder->buffer + decoder->decoder) + std::set<int> two_copy; // decoder indices that require two copies to safely modify KV caches + + // (buffer->decoder or decoder->decoder) + std::set<int> one_copy; // decoder indices that require one copy to safely modify KV caches + + // (decoder<->decoder) + std::set<int> p_swap_set; // decoder indices able to swap KV-cache pointers + std::vector<std::pair<int, int>> p_swap_vec; + p_swap_vec.reserve(n_decoders); + + // see https://github.com/ggerganov/whisper.cpp/wiki + for (int i = 0; i < n_decoders; i++) { + // zero-copy (no modification) + if (i == view[i] || view[i] < 0) { + continue; + } + + bool is_one_copy = true; + // since we modify data sequentially, we only consider decoder indices after the current index + for (int j = i + 1; j < n_decoders; j++) { + if (i == view[j]) { + // detect a symmetric diagram + if (j == view[i]) { + p_swap_set.insert(i); + p_swap_set.insert(j); + p_swap_vec.emplace_back(i, j); + } else { + two_copy.insert(i); + is_one_copy = false; + } + break; + } + } + if (is_one_copy) { + one_copy.insert(i); + } + } + + kv_swap_bufs.resize(n_decoders); + + for (int i = 0; i < n_decoders; i++) { + kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k)); + kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v)); + } + + for (auto & i : two_copy) { + // make a copy of the KV caches + WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i); + memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size()); + memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size()); + } + + // since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first + for (auto & i : two_copy) { + // skip the decoder indices that require pointer swapping + if (p_swap_set.find(i) != p_swap_set.end()) { + continue; + } + + if
(two_copy.find(view[i]) != two_copy.end()) { + // modify the KV caches of the decoder using data from kv_swap_bufs + WHISPER_PRINT_DEBUG("%s: two-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size()); + memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size()); + } else { + // modify the KV caches of the decoder using data from the corresponding decoder KV caches directly + WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k)); + memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v)); + } + } + + // then modify the one-copy decoder KV caches + for (auto & i : one_copy) { + // skip the decoder indices that require pointer swapping + if (p_swap_set.find(i) != p_swap_set.end()) { + continue; + } + + if (two_copy.find(view[i]) != two_copy.end()) { + // modify the KV caches of the decoder using data from kv_swap_bufs + WHISPER_PRINT_DEBUG("%s: one-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size()); + memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size()); + } else { + // modify the KV caches of the decoder using data from the corresponding decoder KV caches directly + WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k)); + memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v)); + } + } + + // swap the pointers + for (auto & i : p_swap_vec) { + WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second); + std::swap(src[i.first].kv_self, src[i.second].kv_self); + } + + return true; +} + +int whisper_full_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + struct whisper_full_params params, + const float * samples, + int n_samples) { + // clear old results + auto & result_all = state->result_all; + + result_all.clear(); + + if (n_samples > 0) { + // compute log mel spectrogram + if (params.speed_up) { + // TODO: Replace PV with a more advanced algorithm + log("%s: failed to compute log mel spectrogram\n", __func__); + return -1; + } else { + if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { + log("%s: failed to compute log mel spectrogram\n", __func__); + return -2; + } + } + } + + // auto-detect language if not specified + if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) { + std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f); + + const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data()); + if (lang_id < 0) { + log("%s: failed to auto-detect language\n", __func__); + return -3; + } + state->lang_id = lang_id; + params.language = whisper_lang_str(lang_id); + + log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]); + if (params.detect_language) { + return 0; + } + } + + if (params.token_timestamps) { + state->t_beg = 0; + state->t_last = 0; + state->tid_last = 0; + if (n_samples > 0) { +
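// signal energy smoothed over a ±32-sample half-window (~2 ms per side at 16 kHz), + // consumed later by the experimental token-level timestamp computation +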
state->energy = get_signal_energy(samples, n_samples, 32); + } + } + + const int seek_start = params.offset_ms/10; + const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10; + + // if the length of the spectrogram is less than 1.0s (100 frames), then return + // basically don't process anything that is less than 1.0s + // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39 + if (seek_end < seek_start + (params.speed_up ? 50 : 100)) { + return 0; + } + + // a set of temperatures to use + // [ t0, t0 + delta, t0 + 2*delta, ..., < 1.0f + 1e-6f ] + std::vector<float> temperatures; + if (params.temperature_inc > 0.0f) { + for (float t = params.temperature; t < 1.0f + 1e-6f; t += params.temperature_inc) { + temperatures.push_back(t); + } + } else { + temperatures.push_back(params.temperature); + } + + // initialize the decoders + int n_decoders = 1; + + switch (params.strategy) { + case WHISPER_SAMPLING_GREEDY: + { + n_decoders = params.greedy.best_of; + } break; + case WHISPER_SAMPLING_BEAM_SEARCH: + { + n_decoders = std::max(params.greedy.best_of, params.beam_search.beam_size); + } break; + }; + + n_decoders = std::max(1, n_decoders); + + // TAGS: WHISPER_DECODER_INIT + for (int j = 1; j < n_decoders; j++) { + auto & decoder = state->decoders[j]; + + if (decoder.kv_self.ctx == nullptr) { + decoder.kv_self = state->decoders[0].kv_self; + if (!kv_cache_reinit(decoder.kv_self)) { + log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j); + return -4; + } + + WHISPER_PRINT_DEBUG("%s: initialized self-attention kv cache, decoder %d\n", __func__, j); + + decoder.sequence.tokens.reserve(state->decoders[0].sequence.tokens.capacity()); + + decoder.probs.resize (ctx->vocab.n_vocab); + decoder.logits.resize (ctx->vocab.n_vocab); + decoder.logprobs.resize(ctx->vocab.n_vocab); + + // TODO: not very clean - look for a better way and potentially merge with the init of decoder 0 +#ifdef GGML_USE_METAL +#define WHISPER_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + return 0; \ + } + + const std::string kv_name = "kv_self_" + std::to_string(j); + auto & kv_self = decoder.kv_self; + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0)); +#undef WHISPER_METAL_CHECK_BUF +#endif + } + } + + // the accumulated text context so far + auto & prompt_past = state->prompt_past; + if (params.no_context) { + prompt_past.clear(); + } + + // prepare prompt + { + std::vector<whisper_token> prompt_tokens; + + // initial prompt + if (!params.prompt_tokens && params.initial_prompt) { + prompt_tokens.resize(1024); + prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size())); + params.prompt_tokens = prompt_tokens.data(); + params.prompt_n_tokens = prompt_tokens.size(); + } + + // prepend the prompt tokens to the prompt_past + if (params.prompt_tokens && params.prompt_n_tokens > 0) { + // parse tokens from the pointer + for (int i = 0; i < params.prompt_n_tokens; i++) { + prompt_past.push_back(params.prompt_tokens[i]); + } + std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); + } + } + + // overwrite audio_ctx, max allowed is hparams.n_audio_ctx + if (params.audio_ctx > whisper_n_audio_ctx(ctx)) { + log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx)); + return -5; + } +
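// audio_ctx == 0 keeps the model's full audio context (hparams.n_audio_ctx); + // smaller values shrink the encoder input, trading accuracy for speed +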
+    state->exp_n_audio_ctx = params.audio_ctx;
+
+    // these tokens determine the task that will be performed
+    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
+    if (whisper_is_multilingual(ctx)) {
+        const int lang_id = whisper_lang_id(params.language);
+        state->lang_id = lang_id;
+        prompt_init.push_back(whisper_token_lang(ctx, lang_id));
+        if (params.translate) {
+            prompt_init.push_back(whisper_token_translate(ctx));
+        } else {
+            prompt_init.push_back(whisper_token_transcribe(ctx));
+        }
+    }
+
+    int seek = seek_start;
+
+    std::vector<whisper_token> prompt;
+    prompt.reserve(whisper_n_text_ctx(ctx));
+
+    struct beam_candidate {
+        int decoder_idx;
+        int seek_delta;
+
+        bool has_ts;
+
+        whisper_sequence sequence;
+    };
+
+    std::vector<beam_candidate> beam_candidates;
+
+    // main loop
+    while (true) {
+        if (params.progress_callback) {
+            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
+
+            params.progress_callback(
+                ctx, ctx->state, progress_cur, params.progress_callback_user_data);
+        }
+
+        // if only 1 second is left, then stop
+        if (seek + 100 >= seek_end) {
+            break;
+        }
+
+        if (params.encoder_begin_callback) {
+            if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
+                log("%s: encoder_begin_callback returned false - aborting\n", __func__);
+                break;
+            }
+        }
+
+        // encode audio features starting at offset seek
+        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
+            log("%s: failed to encode\n", __func__);
+            return -6;
+        }
+
+        // if there is a very short audio segment left to process, we remove any past prompt since it tends
+        // to confuse the decoder and often makes it repeat or hallucinate stuff
+        if (seek > seek_start && seek + 500 >= seek_end) {
+            prompt_past.clear();
+        }
+
+        int best_decoder_id = 0;
+
+        for (int it = 0; it < (int) temperatures.size(); ++it) {
+            const float t_cur = temperatures[it];
+
+            int n_decoders_cur = 1;
+
+            switch (params.strategy) {
+                case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
+                    {
+                        if (t_cur > 0.0f) {
+                            n_decoders_cur = params.greedy.best_of;
+                        }
+                    } break;
+                case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
+                    {
+                        if (t_cur > 0.0f) {
+                            n_decoders_cur = params.greedy.best_of;
+                        } else {
+                            n_decoders_cur = params.beam_search.beam_size;
+                        }
+                    } break;
+            };
+
+            n_decoders_cur = std::max(1, n_decoders_cur);
+
+            WHISPER_PRINT_DEBUG("\n%s: decoding with %d decoders, temperature = %.2f\n", __func__, n_decoders_cur, t_cur);
+
+            // TAGS: WHISPER_DECODER_INIT
+            for (int j = 0; j < n_decoders_cur; ++j) {
+                auto & decoder = state->decoders[j];
+
+                decoder.kv_self.n = 0;
+
+                decoder.sequence.tokens.clear();
+                decoder.sequence.result_len       = 0;
+                decoder.sequence.sum_logprobs_all = 0.0;
+                decoder.sequence.sum_logprobs     = -INFINITY;
+                decoder.sequence.avg_logprobs     = -INFINITY;
+                decoder.sequence.entropy          = 0.0;
+                decoder.sequence.score            = -INFINITY;
+
+                decoder.seek_delta = 100*WHISPER_CHUNK_SIZE;
+
+                decoder.failed    = false;
+                decoder.completed = false;
+                decoder.has_ts    = false;
+            }
+
+            // init prompt and kv cache for the current iteration
+            // run whisper_decoder() only for decoder 0 and copy the results for the other decoders
+            {
+                prompt.clear();
+
+                // if we have already generated some text, use it as a prompt to condition the next generation
+                if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
+                    int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
+
+                    prompt = { whisper_token_prev(ctx) };
+                    prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
+                }
+
+                // init new transcription with sot, language (opt) and task tokens
+                prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
+
+                // print the prompt
+                WHISPER_PRINT_DEBUG("\n\n");
+                for (int i = 0; i < (int) prompt.size(); i++) {
+                    WHISPER_PRINT_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
+                }
+                WHISPER_PRINT_DEBUG("\n\n");
+
+                if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
+                    log("%s: failed to decode\n", __func__);
+                    return -7;
+                }
+
+                {
+                    const int64_t t_start_sample_us = ggml_time_us();
+
+                    whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
+
+                    state->decoders[0].kv_self.n += prompt.size();
+
+                    for (int j = 1; j < n_decoders_cur; ++j) {
+                        auto & decoder = state->decoders[j];
+
+                        memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
+                        memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
+
+                        decoder.kv_self.n += prompt.size();
+
+                        memcpy(decoder.probs.data(),    state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
+                        memcpy(decoder.logits.data(),   state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
+                        memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
+                    }
+
+                    state->t_sample_us += ggml_time_us() - t_start_sample_us;
+                }
+            }
+
+            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
+                    beam_candidates.clear();
+                }
+
+                // generate new sequence candidates for each decoder
+                for (int j = 0; j < n_decoders_cur; ++j) {
+                    auto & decoder = state->decoders[j];
+
+                    if (decoder.completed || decoder.failed) {
+                        continue;
+                    }
+
+                    switch (params.strategy) {
+                        case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
+                            {
+                                if (t_cur < 1e-6f) {
+                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, true));
+                                } else {
+                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, false));
+                                }
+
+                                decoder.sequence.sum_logprobs_all += decoder.sequence.tokens.back().plog;
+                            } break;
+                        case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
+                            {
+                                const auto tokens_new = whisper_sample_token_topk(*ctx, *state, decoder, params.beam_search.beam_size);
+
+                                for (const auto & token : tokens_new) {
+                                    beam_candidates.push_back({ j, decoder.seek_delta, decoder.has_ts, decoder.sequence });
+                                    beam_candidates.back().sequence.tokens.push_back(token);
+                                    beam_candidates.back().sequence.sum_logprobs_all += token.plog;
+
+                                    //WHISPER_PRINT_DEBUG("%s: beam candidate: %s (%f, %f)\n", __func__, ctx->vocab.id_to_token.at(token.id).c_str(), token.plog, beam_candidates.back().sequence.sum_logprobs_all);
+                                }
+                            } break;
+                    };
+                }
+
+                // for beam-search, choose the top candidates and update the KV caches
+                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
+                    std::sort(
+                            beam_candidates.begin(),
+                            beam_candidates.end(),
+                            [](const beam_candidate & a, const beam_candidate & b) {
+                        return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
+                    });
+
+                    uint32_t cur_c = 0;
+                    std::vector<int> decoder_idx(n_decoders_cur, -1);
+
+                    for (int j = 0; j < n_decoders_cur; ++j) {
+                        auto & decoder = 
state->decoders[j]; + + if (decoder.completed || decoder.failed) { + continue; + } + + auto & cur = beam_candidates[cur_c++]; + + while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) { + ++cur_c; + } + + decoder.sequence = cur.sequence; + decoder.seek_delta = cur.seek_delta; + decoder.has_ts = cur.has_ts; + + decoder_idx[j] = cur.decoder_idx; + WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n", + __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all); + } + + // update KV caches + whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur); + } + + // update the decoder state + // - check if the sequence is completed + // - check if the sequence is failed + // - update sliding window based on timestamp tokens + for (int j = 0; j < n_decoders_cur; ++j) { + auto & decoder = state->decoders[j]; + + if (decoder.completed || decoder.failed) { + continue; + } + + auto & has_ts = decoder.has_ts; + auto & failed = decoder.failed; + auto & completed = decoder.completed; + auto & seek_delta = decoder.seek_delta; + auto & result_len = decoder.sequence.result_len; + + { + const auto & token = decoder.sequence.tokens.back(); + + // timestamp token - update sliding window + if (token.id > whisper_token_beg(ctx)) { + const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx)); + + // do not allow to go back in time + if (has_ts && seek_delta > seek_delta_new && result_len < i) { + failed = true; // TODO: maybe this is not a failure ? + continue; + } + + seek_delta = seek_delta_new; + result_len = i + 1; + has_ts = true; + } + +#ifdef WHISPER_DEBUG + { + const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]"; + WHISPER_PRINT_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n", + __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str()); + } +#endif + + // end of segment + if (token.id == whisper_token_eot(ctx) || // end of text token + (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached + (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached + ) { + if (result_len == 0) { + if (seek + seek_delta + 100 >= seek_end) { + result_len = i + 1; + } else { + failed = true; + continue; + } + } + + if (params.single_segment) { + result_len = i + 1; + seek_delta = 100*WHISPER_CHUNK_SIZE; + } + + completed = true; + continue; + } + + // TESTS: if no tensors are loaded, it means we are running tests + if (ctx->model.n_loaded == 0) { + seek_delta = 100*WHISPER_CHUNK_SIZE; + completed = true; + continue; + } + } + + // sometimes, the decoding can get stuck in a repetition loop + // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy + if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) { + failed = true; + continue; + } + } + + // check if all decoders have finished (i.e. 
completed or failed) + { + bool completed_all = true; + + for (int j = 0; j < n_decoders_cur; ++j) { + auto & decoder = state->decoders[j]; + + if (decoder.completed || decoder.failed) { + continue; + } + + completed_all = false; + } + + if (completed_all) { + break; + } + } + + state->t_sample_us += ggml_time_us() - t_start_sample_us; + + // obtain logits for the next token + for (int j = 0; j < n_decoders_cur; ++j) { + auto & decoder = state->decoders[j]; + + if (decoder.failed || decoder.completed) { + continue; + } + + decoder.tokens_tmp.resize(1); + decoder.tokens_tmp[0] = decoder.sequence.tokens.back().id; + + //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta); + + if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) { + log("%s: failed to decode\n", __func__); + return -8; + } + + { + const int64_t t_start_sample_us = ggml_time_us(); + + whisper_process_logits(*ctx, *state, params, decoder, t_cur); + + ++decoder.kv_self.n; + + state->t_sample_us += ggml_time_us() - t_start_sample_us; + } + } + } + + // rank the resulting sequences and select the best one + { + double best_score = -INFINITY; + + for (int j = 0; j < n_decoders_cur; ++j) { + auto & decoder = state->decoders[j]; + + if (decoder.failed) { + continue; + } + + decoder.sequence.tokens.resize(decoder.sequence.result_len); + whisper_sequence_score(params, decoder.sequence); + + WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n", + __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy); + + if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) { + WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n", + __func__, j, decoder.sequence.entropy, params.entropy_thold); + + decoder.failed = true; + state->n_fail_h++; + + continue; + } + + if (best_score < decoder.sequence.score) { + best_score = decoder.sequence.score; + best_decoder_id = j; + } + } + + WHISPER_PRINT_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id); + } + + // was the decoding successful for the current temperature? 
+ // do fallback only if: + // - we are not at the last temperature + // - we are not at the end of the audio (3 sec) + if (it != (int) temperatures.size() - 1 && + seek_end - seek > 10*WHISPER_CHUNK_SIZE) { + bool success = true; + + const auto & decoder = state->decoders[best_decoder_id]; + + if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) { + success = false; + state->n_fail_p++; + } + + if (success) { + //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) { + // WHISPER_PRINT_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str()); + //} + + break; + } + } + + WHISPER_PRINT_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur); + } + + // output results through a user-provided callback + { + const auto & best_decoder = state->decoders[best_decoder_id]; + + const auto seek_delta = best_decoder.seek_delta; + const auto result_len = best_decoder.sequence.result_len; + + const auto & tokens_cur = best_decoder.sequence.tokens; + + //WHISPER_PRINT_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta); + + // update prompt_past + prompt_past.clear(); + if (prompt.front() == whisper_token_prev(ctx)) { + prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); + } + + for (int i = 0; i < result_len; ++i) { + prompt_past.push_back(tokens_cur[i].id); + } + + if (!tokens_cur.empty() && ctx->model.n_loaded > 0) { + int i0 = 0; + auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx)); + + std::string text; + bool speaker_turn_next = false; + + for (int i = 0; i < (int) tokens_cur.size(); i++) { + //printf("%s: %18s %6.3f %18s %6.3f\n", __func__, + // ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p, + // ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt); + + if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) { + text += whisper_token_to_str(ctx, tokens_cur[i].id); + } + + // [TDRZ] record if speaker turn was predicted after current segment + if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) { + speaker_turn_next = true; + } + + if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) { + const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx)); + + if (!text.empty()) { + const auto tt0 = params.speed_up ? 2*t0 : t0; + const auto tt1 = params.speed_up ? 
2*t1 : t1;
+
+                        if (params.print_realtime) {
+                            if (params.print_timestamps) {
+                                printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
+                            } else {
+                                printf("%s", text.c_str());
+                                fflush(stdout);
+                            }
+                        }
+
+                        //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
+
+                        result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
+                        for (int j = i0; j <= i; j++) {
+                            result_all.back().tokens.push_back(tokens_cur[j]);
+                        }
+
+                        int n_new = 1;
+
+                        if (params.token_timestamps) {
+                            whisper_exp_compute_token_level_timestamps(
+                                    *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
+
+                            if (params.max_len > 0) {
+                                n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
+                            }
+                        }
+                        if (params.new_segment_callback) {
+                            params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
+                        }
+                    }
+                    text = "";
+                    while (i < (int) tokens_cur.size() && tokens_cur[i].id > whisper_token_beg(ctx)) {
+                        i++;
+                    }
+                    i--;
+                    t0 = t1;
+                    i0 = i + 1;
+                    speaker_turn_next = false;
+                }
+            }
+
+            if (!text.empty()) {
+                const auto t1 = seek + seek_delta;
+
+                const auto tt0 = params.speed_up ? 2*t0 : t0;
+                const auto tt1 = params.speed_up ? 2*t1 : t1;
+
+                if (params.print_realtime) {
+                    if (params.print_timestamps) {
+                        printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
+                    } else {
+                        printf("%s", text.c_str());
+                        fflush(stdout);
+                    }
+                }
+
+                result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
+                for (int j = i0; j < (int) tokens_cur.size(); j++) {
+                    result_all.back().tokens.push_back(tokens_cur[j]);
+                }
+
+                int n_new = 1;
+
+                if (params.token_timestamps) {
+                    whisper_exp_compute_token_level_timestamps(
+                            *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
+
+                    if (params.max_len > 0) {
+                        n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
+                    }
+                }
+                if (params.new_segment_callback) {
+                    params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
+                }
+            }
+        }
+
+        // update audio window
+        seek += seek_delta;
+
+        WHISPER_PRINT_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
+        }
+    }
+
+    return 0;
+}
+
+int whisper_full(
+        struct whisper_context * ctx,
+    struct whisper_full_params   params,
+                   const float * samples,
+                           int   n_samples) {
+    return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
+}
+
+int whisper_full_parallel(
+        struct whisper_context * ctx,
+        struct whisper_full_params params,
+        const float * samples,
+        int n_samples,
+        int n_processors) {
+    if (n_processors == 1) {
+        return whisper_full(ctx, params, samples, n_samples);
+    }
+    int ret = 0;
+
+    // prepare separate states for each thread
+    std::vector<whisper_state *> states;
+
+    const int offset_samples = (WHISPER_SAMPLE_RATE*params.offset_ms)/1000;
+    const int n_samples_per_processor = (n_samples - offset_samples)/n_processors;
+
+    // the calling thread will process the first chunk
+    // while the other threads will process the remaining chunks
+
+    std::vector<std::thread> workers(n_processors - 1);
+    for (int i = 0; i < n_processors - 1; ++i) {
+        // create a new state for each thread
+        states.push_back(whisper_init_state(ctx));
+
+        const int start_samples = offset_samples + (i + 1)*n_samples_per_processor;
+        const int n_samples_cur = (i == n_processors - 2) ? 
n_samples - start_samples : n_samples_per_processor; + + auto params_cur = params; + + params_cur.offset_ms = 0; + params_cur.print_progress = false; + params_cur.print_realtime = false; + + params_cur.new_segment_callback = nullptr; + params_cur.new_segment_callback_user_data = nullptr; + + params_cur.progress_callback = nullptr; + params_cur.progress_callback_user_data = nullptr; + + workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur); + } + + { + auto params_cur = params; + + // We need to disable the print real-time for this one as well, otherwise it will show only for the first chunk. + params_cur.print_realtime = false; + + // Run the first transformation using default state but only for the first chunk. + ret = whisper_full_with_state(ctx, ctx->state, std::move(params_cur), samples, offset_samples + n_samples_per_processor); + } + + for (int i = 0; i < n_processors - 1; ++i) { + workers[i].join(); + } + + const int64_t offset_t = (int64_t) params.offset_ms/10.0; + + // combine results into result_state->result_all from all other states + for (int i = 0; i < n_processors - 1; ++i) { + auto& results_i = states[i]->result_all; + + for (auto& result : results_i) { + // correct the segment timestamp taking into account the offset + result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t; + result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t; + + // make sure that segments are not overlapping + if (!ctx->state->result_all.empty()) { + result.t0 = std::max(result.t0, ctx->state->result_all.back().t1); + } + + ctx->state->result_all.push_back(std::move(result)); + + // call the new_segment_callback for each segment + if (params.new_segment_callback) { + params.new_segment_callback(ctx, ctx->state, 1, params.new_segment_callback_user_data); + } + } + + ctx->state->t_mel_us += states[i]->t_mel_us; + + ctx->state->t_sample_us += states[i]->t_sample_us; + ctx->state->t_encode_us += states[i]->t_encode_us; + ctx->state->t_decode_us += states[i]->t_decode_us; + ctx->state->t_prompt_us += states[i]->t_prompt_us; + + ctx->state->n_sample += states[i]->n_sample; + ctx->state->n_encode += states[i]->n_encode; + ctx->state->n_decode += states[i]->n_decode; + ctx->state->n_prompt += states[i]->n_prompt; + + whisper_free_state(states[i]); + } + + // average the timings + ctx->state->t_mel_us /= n_processors; + ctx->state->t_sample_us /= n_processors; + ctx->state->t_encode_us /= n_processors; + ctx->state->t_decode_us /= n_processors; + + // print information about the audio boundaries + log("\n"); + log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors); + for (int i = 0; i < n_processors - 1; ++i) { + log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str()); + } + log("%s: the transcription quality may be degraded near these boundaries\n", __func__); + + return ret; +} + +int whisper_full_n_segments_from_state(struct whisper_state * state) { + return state->result_all.size(); +} + +int whisper_full_n_segments(struct whisper_context * ctx) { + return ctx->state->result_all.size(); +} + +int whisper_full_lang_id_from_state(struct whisper_state * state) { + return state->lang_id; +} + +int whisper_full_lang_id(struct whisper_context * ctx) { + return ctx->state->lang_id; +} + +int64_t whisper_full_get_segment_t0_from_state(struct 
whisper_state * state, int i_segment) { + return state->result_all[i_segment].t0; +} + +int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) { + return ctx->state->result_all[i_segment].t0; +} + +int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) { + return state->result_all[i_segment].t1; +} + +int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) { + return ctx->state->result_all[i_segment].t1; +} + +bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) { + return ctx->state->result_all[i_segment].speaker_turn_next; +} + +const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) { + return state->result_all[i_segment].text.c_str(); +} + +const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment) { + return ctx->state->result_all[i_segment].text.c_str(); +} + +int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment) { + return state->result_all[i_segment].tokens.size(); +} + +int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment) { + return ctx->state->result_all[i_segment].tokens.size(); +} + +const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token) { + return ctx->vocab.id_to_token[state->result_all[i_segment].tokens[i_token].id].c_str(); +} + +const char* whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->vocab.id_to_token[ctx->state->result_all[i_segment].tokens[i_token].id].c_str(); +} + +whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token) { + return state->result_all[i_segment].tokens[i_token].id; +} + +whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->state->result_all[i_segment].tokens[i_token].id; +} + +struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) { + return state->result_all[i_segment].tokens[i_token]; +} + +struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->state->result_all[i_segment].tokens[i_token]; +} + +float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) { + return state->result_all[i_segment].tokens[i_token].p; +} + +float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->state->result_all[i_segment].tokens[i_token].p; +} + +// ================================================================================================= + +// +// Temporary interface needed for exposing ggml interface +// Will be removed in the future when ggml becomes a separate library +// + +WHISPER_API int whisper_bench_memcpy(int n_threads) { + fputs(whisper_bench_memcpy_str(n_threads), stderr); + return 0; +} + +WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { + static std::string s; + s = ""; + char strbuf[256]; + + ggml_time_init(); + + size_t n = 20; + size_t arr = n_threads > 0 ? 
1024llu : n_threads; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    // single-thread
+    {
+        char * src = (char *) malloc(size);
+        char * dst = (char *) malloc(size);
+
+        for (size_t i = 0; i < size; i++) src[i] = i;
+
+        memcpy(dst, src, size); // heat-up
+
+        double tsum = 0.0;
+        double sum  = 0.0;
+
+        for (size_t i = 0; i < n; i++) {
+            const int64_t t0 = ggml_time_us();
+
+            memcpy(dst, src, size);
+
+            const int64_t t1 = ggml_time_us();
+
+            tsum += (t1 - t0)*1e-6;
+
+            src[rand() % size] = rand() % 256;
+        }
+
+        snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+        s += strbuf;
+
+        // needed to prevent the compiler from optimizing the memcpy away
+        {
+            for (size_t i = 0; i < size; i++) sum += dst[i];
+
+            snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
+            s += strbuf;
+        }
+
+        free(src);
+        free(dst);
+    }
+
+    return s.c_str();
+}
+
+WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
+    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
+    return 0;
+}
+
+WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    static std::string s;
+    s = "";
+    char strbuf[256];
+
+    ggml_time_init();
+
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
+    std::vector<uint8_t> work;
+
+    // put a bunch of random data in the buffer
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_q4_0 = 0;
+        int n_q4_1 = 0;
+        int n_q5_0 = 0;
+        int n_q5_1 = 0;
+        int n_q8_0 = 0;
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS/s
+        double s_q4_0 = 0.0;
+        double s_q4_1 = 0.0;
+        double s_q5_0 = 0.0;
+        double s_q5_1 = 0.0;
+        double s_q8_0 = 0.0;
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 7; ++k) {
+            const ggml_type wtype =
+                k == 0 ? GGML_TYPE_Q4_0 :
+                k == 1 ? GGML_TYPE_Q4_1 :
+                k == 2 ? GGML_TYPE_Q5_0 :
+                k == 3 ? GGML_TYPE_Q5_1 :
+                k == 4 ? GGML_TYPE_Q8_0 :
+                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
+            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? 
n_fp16 : /*k == 6*/ n_fp32; + + struct ggml_init_params gparams = { + /*.mem_size =*/ buf.size(), + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(gparams); + + struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); + + struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b); + + struct ggml_cgraph gf = ggml_build_forward(c); + + double tsum = 0.0; + + // heat-up + ggml_graph_compute_helper(work, &gf, n_threads); + + for (int i = 0; i < n_max; ++i) { + const int64_t t0 = ggml_time_us(); + + ggml_graph_compute_helper(work, &gf, n_threads); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0)*1e-6; + n++; + + if (tsum > 1.0 && n >= 3) { + break; + } + } + + ggml_free(ctx0); + + s = ((2.0*N*N*N*n)/tsum)*1e-9; + } + + // Q4_0 | Q4_1 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1); + s += strbuf; + + // Q5_0 | Q5_1 | Q8_0 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0); + s += strbuf; + + // F16 | F32 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); + s += strbuf; + } + + return s.c_str(); +} + +// ================================================================================================= + +// ================================================================================================= + +// +// Experimental stuff below +// +// Not sure if these should be part of the library at all, because the quality of the results is not +// guaranteed. 
Might get removed at some point unless a robust algorithm implementation is found
+//
+
+// =================================================================================================
+
+//
+// token-level timestamps
+//
+
+static int timestamp_to_sample(int64_t t, int n_samples) {
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
+}
+
+static int64_t sample_to_timestamp(int i_sample) {
+    return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
+}
+
+// a cost-function / heuristic that is high for text that takes longer to pronounce
+// obviously, can be improved
+static float voice_length(const std::string & text) {
+    float res = 0.0f;
+
+    for (char c : text) {
+        if (c == ' ') {
+            res += 0.01f;
+        } else if (c == ',') {
+            res += 2.00f;
+        } else if (c == '.') {
+            res += 3.00f;
+        } else if (c == '!') {
+            res += 3.00f;
+        } else if (c == '?') {
+            res += 3.00f;
+        } else if (c >= '0' && c <= '9') {
+            res += 3.00f;
+        } else {
+            res += 1.00f;
+        }
+    }
+
+    return res;
+}
+
+// average the fabs of the signal
+static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) {
+    const int hw = n_samples_per_half_window;
+
+    std::vector<float> result(n_samples);
+
+    for (int i = 0; i < n_samples; i++) {
+        float sum = 0;
+        for (int j = -hw; j <= hw; j++) {
+            if (i + j >= 0 && i + j < n_samples) {
+                sum += fabs(signal[i + j]);
+            }
+        }
+        result[i] = sum/(2*hw + 1);
+    }
+
+    return result;
+}
+
+static void whisper_exp_compute_token_level_timestamps(
+        struct whisper_context & ctx,
+          struct whisper_state & state,
+                           int   i_segment,
+                         float   thold_pt,
+                         float   thold_ptsum) {
+    auto & segment = state.result_all[i_segment];
+    auto & tokens  = segment.tokens;
+
+    const int n_samples = state.energy.size();
+
+    if (n_samples == 0) {
+        log("%s: no signal data available\n", __func__);
+        return;
+    }
+
+    const int64_t t0 = segment.t0;
+    const int64_t t1 = segment.t1;
+
+    const int n = tokens.size();
+
+    if (n == 0) {
+        return;
+    }
+
+    if (n == 1) {
+        tokens[0].t0 = t0;
+        tokens[0].t1 = t1;
+
+        return;
+    }
+
+    auto & t_beg    = state.t_beg;
+    auto & t_last   = state.t_last;
+    auto & tid_last = state.tid_last;
+
+    for (int j = 0; j < n; ++j) {
+        auto & token = tokens[j];
+
+        if (j == 0) {
+            if (token.id == whisper_token_beg(&ctx)) {
+                tokens[j    ].t0 = t0;
+                tokens[j    ].t1 = t0;
+                tokens[j + 1].t0 = t0;
+
+                t_beg    = t0;
+                t_last   = t0;
+                tid_last = whisper_token_beg(&ctx);
+            } else {
+                tokens[j    ].t0 = t_last;
+            }
+        }
+
+        const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(&ctx));
+
+        tokens[j].id    = token.id;
+        tokens[j].tid   = token.tid;
+        tokens[j].p     = token.p;
+        tokens[j].pt    = token.pt;
+        tokens[j].ptsum = token.ptsum;
+
+        tokens[j].vlen = voice_length(whisper_token_to_str(&ctx, token.id));
+
+        if (token.pt > thold_pt && token.ptsum > thold_ptsum && token.tid > tid_last && tt <= t1) {
+            if (j > 0) {
+                tokens[j - 1].t1 = tt;
+            }
+            tokens[j].t0 = tt;
+            tid_last = token.tid;
+        }
+    }
+
+    tokens[n - 2].t1 = t1;
+    tokens[n - 1].t0 = t1;
+    tokens[n - 1].t1 = t1;
+
+    t_last = t1;
+
+    // find intervals of tokens with unknown timestamps
+    // fill the timestamps by proportionally splitting the interval based on the token voice lengths
+    {
+        int p0 = 0;
+        int p1 = 0;
+
+        while (true) {
+            while (p1 < n && tokens[p1].t1 < 0) {
+                p1++;
+            }
+
+            if (p1 >= n) {
+                p1--;
+            }
+
+            //printf("p0=%d p1=%d t0=%lld t1=%lld\n", p0, p1, tokens[p0].t0, tokens[p1].t1);
+
+            if (p1 > p0) {
+                double psum = 0.0;
+                for (int j = p0; j <= p1; j++) {
+                    psum += tokens[j].vlen;
+                }
+
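+                // note: e.g. an interval of length dt spanning three tokens with
+                // vlen = { 1, 2, 1 } (psum = 4) gets its inner boundaries at
+                // t0 + dt/4 and t0 + 3*dt/4 - longer tokens receive longer time slices
+
+                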
//printf("analyzing %d - %d, psum = %f\n", p0, p1, psum); + + const double dt = tokens[p1].t1 - tokens[p0].t0; + + // split the time proportionally to the voice length + for (int j = p0 + 1; j <= p1; j++) { + const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum; + + tokens[j - 1].t1 = ct; + tokens[j ].t0 = ct; + } + } + + p1++; + p0 = p1; + if (p1 >= n) { + break; + } + } + } + + // fix up (just in case) + for (int j = 0; j < n - 1; j++) { + if (tokens[j].t1 < 0) { + tokens[j + 1].t0 = tokens[j].t1; + } + + if (j > 0) { + if (tokens[j - 1].t1 > tokens[j].t0) { + tokens[j].t0 = tokens[j - 1].t1; + tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1); + } + } + } + + // VAD + // expand or contract tokens based on voice activity + { + const int hw = WHISPER_SAMPLE_RATE/8; + + for (int j = 0; j < n; j++) { + if (tokens[j].id >= whisper_token_eot(&ctx)) { + continue; + } + + int s0 = timestamp_to_sample(tokens[j].t0, n_samples); + int s1 = timestamp_to_sample(tokens[j].t1, n_samples); + + const int ss0 = std::max(s0 - hw, 0); + const int ss1 = std::min(s1 + hw, n_samples); + + const int ns = ss1 - ss0; + + float sum = 0.0f; + + for (int k = ss0; k < ss1; k++) { + sum += state.energy[k]; + } + + const float thold = 0.5*sum/ns; + + { + int k = s0; + if (state.energy[k] > thold && j > 0) { + while (k > 0 && state.energy[k] > thold) { + k--; + } + tokens[j].t0 = sample_to_timestamp(k); + if (tokens[j].t0 < tokens[j - 1].t1) { + tokens[j].t0 = tokens[j - 1].t1; + } else { + s0 = k; + } + } else { + while (state.energy[k] < thold && k < s1) { + k++; + } + s0 = k; + tokens[j].t0 = sample_to_timestamp(k); + } + } + + { + int k = s1; + if (state.energy[k] > thold) { + while (k < n_samples - 1 && state.energy[k] > thold) { + k++; + } + tokens[j].t1 = sample_to_timestamp(k); + if (j < ns - 1 && tokens[j].t1 > tokens[j + 1].t0) { + tokens[j].t1 = tokens[j + 1].t0; + } else { + s1 = k; + } + } else { + while (state.energy[k] < thold && k > s0) { + k--; + } + s1 = k; + tokens[j].t1 = sample_to_timestamp(k); + } + } + } + } + + // fixed token expand (optional) + //{ + // const int t_expand = 0; + + // for (int j = 0; j < n; j++) { + // if (j > 0) { + // tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand)); + // } + // if (j < n - 1) { + // tokens[j].t1 = tokens[j].t1 + t_expand; + // } + // } + //} + + // debug info + //for (int j = 0; j < n; ++j) { + // const auto & token = tokens[j]; + // const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? 
whisper_token_to_str(&ctx, token.tid) : "[?]";
+    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
+    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(&ctx, token.id));
+
+    //    if (tokens[j].id >= whisper_token_eot(&ctx)) {
+    //        continue;
+    //    }
+    //}
+}
+
+void whisper_set_log_callback(whisper_log_callback callback) {
+    whisper_log = callback;
+}
diff --git a/stable-diffusion.cpp/ggml/examples/whisper/whisper.h b/stable-diffusion.cpp/ggml/examples/whisper/whisper.h
new file mode 100644
index 0000000000000000000000000000000000000000..73ab4d799a23ad73bf9a6406fd9e503116bb8917
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/examples/whisper/whisper.h
@@ -0,0 +1,531 @@
+#ifndef WHISPER_H
+#define WHISPER_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef WHISPER_SHARED
+#    ifdef _WIN32
+#        ifdef WHISPER_BUILD
+#            define WHISPER_API __declspec(dllexport)
+#        else
+#            define WHISPER_API __declspec(dllimport)
+#        endif
+#    else
+#        define WHISPER_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define WHISPER_API
+#endif
+
+#define WHISPER_SAMPLE_RATE 16000
+#define WHISPER_N_FFT       400
+#define WHISPER_N_MEL       80
+#define WHISPER_HOP_LENGTH  160
+#define WHISPER_CHUNK_SIZE  30
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    //
+    // C interface
+    //
+    // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
+    // concurrently.
+    //
+    // Basic usage:
+    //
+    //     #include "whisper.h"
+    //
+    //     ...
+    //
+    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+    //
+    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+    //         fprintf(stderr, "failed to process audio\n");
+    //         return 7;
+    //     }
+    //
+    //     const int n_segments = whisper_full_n_segments(ctx);
+    //     for (int i = 0; i < n_segments; ++i) {
+    //         const char * text = whisper_full_get_segment_text(ctx, i);
+    //         printf("%s", text);
+    //     }
+    //
+    //     whisper_free(ctx);
+    //
+    //     ...
+    //
+    // This is a demonstration of the most straightforward usage of the library.
+    // "pcmf32" contains the RAW audio data in 32-bit floating point format.
+    //
+    // The interface also allows for more fine-grained control over the computation, but it requires a deeper
+    // understanding of how the model works.
+    //
+
+    struct whisper_context;
+    struct whisper_state;
+    struct whisper_full_params;
+
+    typedef int whisper_token;
+
+    typedef struct whisper_token_data {
+        whisper_token id;  // token id
+        whisper_token tid; // forced timestamp token id
+
+        float p;           // probability of the token
+        float plog;        // log probability of the token
+        float pt;          // probability of the timestamp token
+        float ptsum;       // sum of probabilities of all timestamp tokens
+
+        // token-level timestamp data
+        // do not use if you haven't computed token-level timestamps
+        int64_t t0;        // start time of the token
+        int64_t t1;        //   end time of the token
+
+        float vlen;        // voice length of the token
+    } whisper_token_data;
+
+    typedef struct whisper_model_loader {
+        void * context;
+
+        size_t (*read)(void * ctx, void * output, size_t read_size);
+        bool    (*eof)(void * ctx);
+        void  (*close)(void * ctx);
+    } whisper_model_loader;
+
+    // Various functions for loading a ggml whisper model.
+    // Allocate (almost) all memory needed for the model.
+ // Return NULL on failure + WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model); + WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size); + WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader); + + // These are the same as the above, but the internal state of the context is not allocated automatically + // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523) + WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model); + WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size); + WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader); + + WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx); + + // Given a context, enable use of OpenVINO for encode inference. + // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr, + // the path will be generated from the ggml model path that was passed + // in to whisper_init_from_file. For example, if 'path_model' was + // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be + // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml". + // device: OpenVINO device to run inference on ("CPU", "GPU", etc.) + // cache_dir: Optional cache directory that can speed up init time, especially for + // GPU, by caching compiled 'blobs' there. + // Set to nullptr if not used. + // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1. + WHISPER_API int whisper_ctx_init_openvino_encoder( + struct whisper_context * ctx, + const char * model_path, + const char * device, + const char * cache_dir); + + // Frees all allocated memory + WHISPER_API void whisper_free (struct whisper_context * ctx); + WHISPER_API void whisper_free_state(struct whisper_state * state); + WHISPER_API void whisper_free_params(struct whisper_full_params * params); + + // Convert RAW PCM audio to log mel spectrogram. + // The resulting spectrogram is stored inside the default state of the provided whisper context. + // Returns 0 on success + WHISPER_API int whisper_pcm_to_mel( + struct whisper_context * ctx, + const float * samples, + int n_samples, + int n_threads); + + WHISPER_API int whisper_pcm_to_mel_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const float * samples, + int n_samples, + int n_threads); + + // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. + // The resulting spectrogram is stored inside the default state of the provided whisper context. + // Returns 0 on success + WHISPER_API int whisper_pcm_to_mel_phase_vocoder( + struct whisper_context * ctx, + const float * samples, + int n_samples, + int n_threads); + + WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const float * samples, + int n_samples, + int n_threads); + + // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context. + // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. 
+ // n_mel must be 80 + // Returns 0 on success + WHISPER_API int whisper_set_mel( + struct whisper_context * ctx, + const float * data, + int n_len, + int n_mel); + + WHISPER_API int whisper_set_mel_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const float * data, + int n_len, + int n_mel); + + // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context. + // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first. + // offset can be used to specify the offset of the first frame in the spectrogram. + // Returns 0 on success + WHISPER_API int whisper_encode( + struct whisper_context * ctx, + int offset, + int n_threads); + + WHISPER_API int whisper_encode_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + int offset, + int n_threads); + + // Run the Whisper decoder to obtain the logits and probabilities for the next token. + // Make sure to call whisper_encode() first. + // tokens + n_tokens is the provided context for the decoder. + // n_past is the number of tokens to use from previous decoder calls. + // Returns 0 on success + // TODO: add support for multiple decoders + WHISPER_API int whisper_decode( + struct whisper_context * ctx, + const whisper_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + WHISPER_API int whisper_decode_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const whisper_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + // Convert the provided text into tokens. + // The tokens pointer must be large enough to hold the resulting tokens. + // Returns the number of tokens on success, no more than n_max_tokens + // Returns -1 on failure + // TODO: not sure if correct + WHISPER_API int whisper_tokenize( + struct whisper_context * ctx, + const char * text, + whisper_token * tokens, + int n_max_tokens); + + // Largest language id (i.e. number of available languages - 1) + WHISPER_API int whisper_lang_max_id(); + + // Return the id of the specified language, returns -1 if not found + // Examples: + // "de" -> 2 + // "german" -> 2 + WHISPER_API int whisper_lang_id(const char * lang); + + // Return the short string of the specified language id (e.g. 
2 -> "de"), returns nullptr if not found + WHISPER_API const char * whisper_lang_str(int id); + + // Use mel data at offset_ms to try and auto-detect the spoken language + // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first + // Returns the top language id or negative on failure + // If not null, fills the lang_probs array with the probabilities of all languages + // The array must be whisper_lang_max_id() + 1 in size + // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69 + WHISPER_API int whisper_lang_auto_detect( + struct whisper_context * ctx, + int offset_ms, + int n_threads, + float * lang_probs); + + WHISPER_API int whisper_lang_auto_detect_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + int offset_ms, + int n_threads, + float * lang_probs); + + WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length + WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length + WHISPER_API int whisper_n_vocab (struct whisper_context * ctx); + WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx); + + WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx); + WHISPER_API int whisper_model_ftype (struct whisper_context * ctx); + WHISPER_API int whisper_model_type (struct whisper_context * ctx); + + // Token logits obtained from the last call to whisper_decode() + // The logits for the last token are stored in the last row + // Rows: n_tokens + // Cols: n_vocab + WHISPER_API float * whisper_get_logits (struct whisper_context * ctx); + WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state); + + // Token Id -> String. 
Uses the vocabulary in the provided context
+    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
+    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
+
+
+    // Special tokens
+    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
+
+    // Task tokens
+    WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
+
+    // Performance information from the default state.
+    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
+    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
+
+    // Print system information
+    WHISPER_API const char * whisper_print_system_info(void);
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Available sampling strategies
+    enum whisper_sampling_strategy {
+        WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
+        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
+    };
+
+    // Text segment callback
+    // Called on every newly generated text segment
+    // Use the whisper_full_...() functions to obtain the text segments
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+
+    // Progress callback
+    typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
+
+    // Encoder begin callback
+    // If not NULL, called before the encoder starts
+    // If it returns false, the computation is aborted
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
+
+    // Logits filter callback
+    // Can be used to modify the logits before sampling
+    // If not NULL, called after applying temperature to logits
+    typedef void (*whisper_logits_filter_callback)(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+          const whisper_token_data * tokens,
+                               int   n_tokens,
+                             float * logits,
+                              void * user_data);
+
+    // Parameters for the whisper_full() function
+    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
+    // whisper_full_default_params()
+    struct whisper_full_params {
+        enum whisper_sampling_strategy strategy;
+
+        int n_threads;
+        int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
+        int offset_ms;          // start offset in ms
+        int duration_ms;        // audio duration to process in ms
+
+        bool translate;
+        bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
+        bool single_segment;    // force single segment output (useful for streaming)
+        bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
+        bool print_progress;   // print progress information
+        bool print_realtime;   // print results from within whisper.cpp (avoid it, use callback instead)
+        bool print_timestamps; // print timestamps for each text segment when printing realtime
+
+        // [EXPERIMENTAL] token-level timestamps
+        bool  token_timestamps; // enable token-level timestamps
+        float thold_pt;         // timestamp token probability threshold (~0.01)
+        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
+        int   max_len;          // max segment length in characters
+        bool  split_on_word;    // split on word rather than on token (when used with max_len)
+        int   max_tokens;       // max tokens per segment (0 = no limit)
+
+        // [EXPERIMENTAL] speed-up techniques
+        // note: these can significantly reduce the quality of the output
+        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
+        bool debug_mode;        // enable debug mode to get extra info (e.g. dump the log mel spectrogram)
+        int  audio_ctx;         // overwrite the audio context size (0 = use default)
+
+        // [EXPERIMENTAL] [TDRZ] tinydiarize
+        bool tdrz_enable;       // enable tinydiarize speaker turn detection
+
+        // tokens to provide to the whisper decoder as initial prompt
+        // these are prepended to any existing text context from a previous call
+        const char * initial_prompt;
+        const whisper_token * prompt_tokens;
+        int prompt_n_tokens;
+
+        // for auto-detection, set to nullptr, "" or "auto"
+        const char * language;
+        bool detect_language;
+
+        // common decoding parameters:
+        bool suppress_blank;             // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
+        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
+
+        float temperature;    // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
+        float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
+        float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
+
+        // fallback parameters
+        // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
+        float temperature_inc;
+        float entropy_thold;  // similar to OpenAI's "compression_ratio_threshold"
+        float logprob_thold;
+        float no_speech_thold; // TODO: not implemented
+
+        struct {
+            int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+        } greedy;
+
+        struct {
+            int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
+
+            float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
+        } beam_search;
+
+        // called for every newly generated text segment
+        whisper_new_segment_callback new_segment_callback;
+        void * new_segment_callback_user_data;
+
+        // called on each progress update
+        whisper_progress_callback progress_callback;
+        void * progress_callback_user_data;
+
+        // called each time before the encoder starts
+        whisper_encoder_begin_callback encoder_begin_callback;
+        void * encoder_begin_callback_user_data;
+
+        // called by each decoder to filter obtained logits
+        whisper_logits_filter_callback logits_filter_callback;
+        void * logits_filter_callback_user_data;
+    };
+
+    // NOTE: this function allocates memory, and it is the responsibility of the caller to free the 
pointer - see whisper_free_params() + WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy); + WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy); + + // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text + // Not thread safe for same context + // Uses the specified decoding strategy to obtain the text. + WHISPER_API int whisper_full( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples); + + WHISPER_API int whisper_full_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + struct whisper_full_params params, + const float * samples, + int n_samples); + + // Split the input audio in chunks and process each chunk separately using whisper_full_with_state() + // Result is stored in the default state of the context + // Not thread safe if executed in parallel on the same context. + // It seems this approach can offer some speedup in some cases. + // However, the transcription accuracy can be worse at the beginning and end of each chunk. + WHISPER_API int whisper_full_parallel( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples, + int n_processors); + + // Number of generated text segments + // A segment can be a few words, a sentence, or even a paragraph. + WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx); + WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state); + + // Language id associated with the context's default state + WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx); + + // Language id associated with the provided state + WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state); + + // Get the start and end time of the specified segment + WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment); + WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment); + + WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment); + WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment); + + // Get whether the next segment is predicted as a speaker turn + WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment); + + // Get the text of the specified segment + WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment); + WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment); + + // Get number of tokens in the specified segment + WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment); + WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment); + + // Get the token text of the specified token in the specified segment + WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token); + + WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API whisper_token 
+
+    // Get token data for the specified token in the specified segment
+    // This contains probabilities, timestamps, etc.
+    WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
+
+    // Get the probability of the specified token in the specified segment
+    WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Temporary helpers needed for exposing ggml interface
+
+    WHISPER_API int whisper_bench_memcpy (int n_threads);
+    WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
+    WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
+    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
+
+    // Control logging output; default behavior is to print to stderr
+
+    typedef void (*whisper_log_callback)(const char * line);
+    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/stable-diffusion.cpp/ggml/ggml.pc.in b/stable-diffusion.cpp/ggml/ggml.pc.in
new file mode 100644
index 0000000000000000000000000000000000000000..5f53cb8768a0e79a24d29edb5128aa462041f01c
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/ggml.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=${prefix}/include
+libdir=${prefix}/lib
+
+Name: ggml
+Description: The GGML Tensor Library for Machine Learning
+Version: 0.0.0
+Cflags: -I${includedir}/ggml
+Libs: -L${libdir} -lggml
diff --git a/stable-diffusion.cpp/ggml/include/ggml/ggml-alloc.h b/stable-diffusion.cpp/ggml/include/ggml/ggml-alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e38758878b91a36b886c21732403809c78765557
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/include/ggml/ggml-alloc.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_backend_buffer;
+
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
+GGML_API void   ggml_allocr_free       (struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset      (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
+
+GGML_API size_t ggml_allocr_alloc_graph_n(
+    struct ggml_allocr * alloc,
+    struct ggml_cgraph ** graphs, int n_graphs,
+    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
+
+#ifdef __cplusplus
+}
+#endif
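This allocator is typically driven in two passes. A sketch (commentary, not file content) under the assumption that a complete ggml_cgraph named graph has already been built; GGML_MEM_ALIGN is defined in ggml.h, and any backend-appropriate alignment could be used instead:

    // pass 1: a "measure" allocator records the worst-case memory needed by the graph
    struct ggml_allocr * alloc = ggml_allocr_new_measure(GGML_MEM_ALIGN);
    size_t mem_size = ggml_allocr_alloc_graph(alloc, graph);
    ggml_allocr_free(alloc);

    // pass 2: allocate a real buffer of that size and place the tensors into it
    void * buf = malloc(mem_size);
    alloc = ggml_allocr_new(buf, mem_size, GGML_MEM_ALIGN);
    ggml_allocr_alloc_graph(alloc, graph);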
diff --git a/stable-diffusion.cpp/ggml/include/ggml/ggml-backend.h b/stable-diffusion.cpp/ggml/include/ggml/ggml-backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..da134b0dbed514423b1223814b2346c4048a2854
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/include/ggml/ggml-backend.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    struct ggml_backend;
+    struct ggml_backend_buffer;
+
+    // type-erased backend-specific types / wrappers
+    typedef void * ggml_backend_context_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef void * ggml_backend_buffer_context_t;
+
+    // avoid accessing internals of these types
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+
+    //
+    // backend buffer
+    //
+
+    struct ggml_backend_buffer_i {
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
+        void * (*get_base)       (ggml_backend_buffer_t buffer); // get base pointer
+        size_t (*get_alloc_size) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+        void   (*free_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i iface;
+
+        ggml_backend_t backend;
+        ggml_backend_buffer_context_t context;
+
+        size_t size;
+    };
+
+    // backend buffer functions
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+            struct ggml_backend * backend,
+            struct ggml_backend_buffer_i iface,
+            ggml_backend_buffer_context_t context,
+            size_t size);
+
+    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+    //
+    // backend
+    //
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+
+        // get buffer alignment
+        size_t (*get_alignment)(ggml_backend_t backend);
+
+        // tensor data access
+        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*synchronize)     (ggml_backend_t backend);
+
+        // (optional) copy tensor between different backends, allow for single-copy transfers
+        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*graph_plan_free)    (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void (*graph_plan_compute) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph without a plan
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend {
+        struct ggml_backend_i iface;
+
+        ggml_backend_context_t context;
+    };
+
+    // backend helper functions
+    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+
+    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_backend_graph_plan_free    (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_supports_op        (ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
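Putting the interface together on the CPU backend (an illustrative sketch, not file content; buf_size and the graph gf are assumed to be set up elsewhere):

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);

    // tensor data lives in a backend buffer rather than in plain host memory
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buf_size);

    // ... create tensors, upload their data with ggml_backend_tensor_set(),
    //     and build the computation graph gf ...

    ggml_backend_graph_compute(backend, &gf);

    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);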
diff --git a/stable-diffusion.cpp/ggml/include/ggml/ggml.h b/stable-diffusion.cpp/ggml/include/ggml/ggml.h
new file mode 100644
index 0000000000000000000000000000000000000000..79cba14ce93bd0b3b3d114c4e69bf9c93a62829f
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/include/ggml/ggml.h
@@ -0,0 +1,2130 @@
+#pragma once
+
+//
+// GGML Tensor Library
+//
+// This documentation is still a work in progress.
+// If you wish some specific topics to be covered, feel free to drop a comment:
+//
+//   https://github.com/ggerganov/whisper.cpp/issues/40
+//
+// ## Overview
+//
+// This library implements:
+//
+//  - a set of tensor operations
+//  - automatic differentiation
+//  - basic optimization algorithms
+//
+// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
+// but is not limited to, the following:
+//
+//  - linear regression
+//  - support vector machines
+//  - neural networks
+//
+// The library allows the user to define a certain function using the available tensor operations. This function
+// definition is represented internally via a computation graph. Each tensor operation in the function definition
+// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+// using one of the available optimization algorithms.
+//
+// For example, here we define the function: f(x) = a*x^2 + b
+//
+//   {
+//       struct ggml_init_params params = {
+//           .mem_size   = 16*1024*1024,
+//           .mem_buffer = NULL,
+//       };
+//
+//       // memory allocation happens here
+//       struct ggml_context * ctx = ggml_init(params);
+//
+//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//
+//       ggml_set_param(ctx, x); // x is an input variable
+//
+//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
+//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
+//
+//       ...
+//   }
+//
+// Notice that the function definition above does not involve any actual computation. The computation is performed only
+// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
+//
+//   {
+//       ...
+//
+//       struct ggml_cgraph gf = ggml_build_forward(f);
+//
+//       // set the input variable and parameter values
+//       ggml_set_f32(x, 2.0f);
+//       ggml_set_f32(a, 3.0f);
+//       ggml_set_f32(b, 4.0f);
+//
+//       ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+//
+//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
+//
+//       ...
+//   }
+//
+// The actual computation is performed in the ggml_graph_compute() function.
+//
+// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
+// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
+// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
+// buffer and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory
+// was actually needed.
+//
+// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
+// differentiation and optimization algorithms.
+//
+// The described approach allows defining the function graph once and then computing its forward or backward graphs
+// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
+// the user can avoid the memory allocation overhead at runtime.
+//
+// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
+// citizens, but in theory the library can be extended to support FP8 and integer data types.
+//
+// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
+// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
+// clear that the library needs to support more complex operations.
+// The way to support these operations is not clear yet, but a few examples are demonstrated in the following operations:
+//
+//   - ggml_permute()
+//   - ggml_conv_1d_1s()
+//   - ggml_conv_1d_2s()
+//
+// For each tensor operator, the library implements a forward and backward computation function. The forward function
+// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
+// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
+// calculus class, or watch the following video:
+//
+//   What is Automatic Differentiation?
+//   https://www.youtube.com/watch?v=wG_nF1awSSY
+//
+//
+// ## Tensor data (struct ggml_tensor)
+//
+// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
+// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
+// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
+//
+//   {
+//       struct ggml_tensor * c = ggml_add(ctx, a, b);
+//
+//       assert(c->src[0] == a);
+//       assert(c->src[1] == b);
+//   }
+//
+// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
+// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
+// storing tensors that are not contiguous in memory, which is useful for operations such as transposition and
+// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
+// contiguous in memory.
+//
+// The data of the tensor is accessed via the "data" pointer. For example:
+//
+//   {
+//       const int nx = 2;
+//       const int ny = 3;
+//
+//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
+//
+//       for (int y = 0; y < ny; y++) {
+//           for (int x = 0; x < nx; x++) {
+//               *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+//           }
+//       }
+//
+//       ...
+//   }
+//
+// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d(), that can be used.
+//
+// ## The matrix multiplication operator (ggml_mul_mat)
+//
+// TODO
+//
+//
+// ## Multi-threading
+//
+// TODO
+//
+//
+// ## Overview of ggml.c
+//
+// TODO
+//
+//
+// ## SIMD optimizations
+//
+// TODO
+//
+//
+// ## Debugging ggml
+//
+// TODO
+//
+//
+
+#ifdef GGML_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BUILD
+#            define GGML_API __declspec(dllexport)
+#        else
+#            define GGML_API __declspec(dllimport)
+#        endif
+#    else
+#        define GGML_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define GGML_API
+#endif
+
+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
+#ifndef __GNUC__
+#    define GGML_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
+
+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
+#define GGML_MAX_DIMS          4
+#define GGML_MAX_NODES         16384
+#define GGML_MAX_PARAMS        1024
+#define GGML_MAX_CONTEXTS      64
+#define GGML_MAX_SRC           6
+#define GGML_MAX_NAME          64
+#define GGML_MAX_OP_PARAMS     32
+#define GGML_DEFAULT_N_THREADS 4
+
+#if UINTPTR_MAX == 0xFFFFFFFF
+    #define GGML_MEM_ALIGN 4
+#else
+    #define GGML_MEM_ALIGN 16
+#endif
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 2
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#define GGML_UNUSED(x) (void)(x)
+
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
+
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
+    typedef __fp16 ggml_fp16_t;
+#else
+    typedef uint16_t ggml_fp16_t;
+#endif
+
+    // convert FP16 <-> FP32
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+
+    struct ggml_object;
+    struct ggml_context;
+
+    enum ggml_type {
+        GGML_TYPE_F32  = 0,
+        GGML_TYPE_F16  = 1,
+        GGML_TYPE_Q4_0 = 2,
+        GGML_TYPE_Q4_1 = 3,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
+        // GGML_TYPE_Q4_3 (5) support has been removed
+        GGML_TYPE_Q5_0 = 6,
+        GGML_TYPE_Q5_1 = 7,
+        GGML_TYPE_Q8_0 = 8,
+        GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
+        GGML_TYPE_I8,
+        GGML_TYPE_I16,
+        GGML_TYPE_I32,
+        GGML_TYPE_COUNT,
+    };
+
+    enum ggml_backend_type {
+        GGML_BACKEND_CPU       = 0,
+        GGML_BACKEND_GPU       = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
+    };
+
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN = -1,
GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + }; + + // available tensor operations: + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_ADD1, + GGML_OP_ACC, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_LOG, + GGML_OP_SUM, + GGML_OP_SUM_ROWS, + GGML_OP_MEAN, + GGML_OP_ARGMAX, + GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, + GGML_OP_SILU_BACK, + GGML_OP_NORM, // normalize + GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, + + GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, + + GGML_OP_SCALE, + GGML_OP_SET, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_DIAG, + GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, + GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, + GGML_OP_ROPE, + GGML_OP_ROPE_BACK, + GGML_OP_ALIBI, + GGML_OP_CLAMP, + GGML_OP_CONV_1D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal + GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_2D_STAGE_0, // internal + GGML_OP_CONV_2D_STAGE_1, // internal + GGML_OP_CONV_TRANSPOSE_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, + + GGML_OP_UPSCALE, // nearest interpolate + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, + + GGML_OP_UNARY, + + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + + GGML_OP_COUNT, + }; + + enum ggml_unary_op { + GGML_UNARY_OP_ABS, + GGML_UNARY_OP_SGN, + GGML_UNARY_OP_NEG, + GGML_UNARY_OP_STEP, + GGML_UNARY_OP_TANH, + GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_QUICK, + GGML_UNARY_OP_SILU, + }; + + enum ggml_object_type { + GGML_OBJECT_TENSOR, + GGML_OBJECT_GRAPH, + GGML_OBJECT_WORK_BUFFER + }; + + enum ggml_log_level { + GGML_LOG_LEVEL_ERROR = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4 + }; + + // ggml object + struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; + }; + + static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + + // n-dimensional tensor + struct ggml_tensor { + enum ggml_type type; + enum ggml_backend_type backend; + + struct ggml_backend_buffer * buffer; + + int n_dims; + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = ggml_type_size(type) + // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + // op params - allocated as int32_t for alignment + 
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + + bool is_param; + bool not_own_data; + bool dynamic; + bool dynamic_hold; + int n_dst; + int n_dst_curr; + + struct ggml_tensor * grad; + struct ggml_tensor * src[GGML_MAX_SRC]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + struct ggml_tensor * view_src; + size_t view_offs; + + void * data; + + char name[GGML_MAX_NAME]; + + void * extra; // extra things e.g. for ggml-cuda.cu + + char padding[4]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + + // next prime after GGML_MAX_NODES + // #define GGML_GRAPH_HASHTABLE_SIZE 4099 + // next prime after GGML_MAX_NODES * 2 (nodes + leafs) + // #define GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define GGML_GRAPH_HASHTABLE_SIZE 16411 + #define GGML_GRAPH_HASHTABLE_SIZE 32771 + + enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT + }; + + // computation graph + struct ggml_cgraph { + int n_nodes; + int n_leafs; + + struct ggml_tensor * nodes[GGML_MAX_NODES]; + struct ggml_tensor * grads[GGML_MAX_NODES]; + struct ggml_tensor * leafs[GGML_MAX_NODES]; + + void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + + enum ggml_cgraph_eval_order order; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph); + + // scratch buffer + struct ggml_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + bool dynamic; // allocate memory for the tensor data dynamically + }; + + + // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
+ enum ggml_task_type { + GGML_TASK_INIT = 0, + GGML_TASK_COMPUTE, + GGML_TASK_FINALIZE, + }; + + struct ggml_compute_params { + enum ggml_task_type type; + + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + }; + + // misc + + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); + + GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); + + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); + + GGML_API int ggml_blck_size (enum ggml_type type); + GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block + GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); + + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + + GGML_API bool ggml_is_quantized(enum ggml_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + + // main + + GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); + GGML_API void ggml_free(struct ggml_context * ctx); + + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); + GGML_API size_t ggml_used_mem_of_data(const struct ggml_context* ctx); + + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + + GGML_API void ggml_set_dynamic(struct ggml_context * ctx, bool dynamic); + GGML_API bool ggml_get_dynamic(struct ggml_context* ctx); + + GGML_API void ggml_hold_dynamic_tensor(struct ggml_tensor * tensor); + GGML_API void ggml_free_dynamic_tensor(struct ggml_tensor * tensor); + GGML_API size_t ggml_dynamic_size(void); + GGML_API size_t ggml_max_dynamic_size(void); + GGML_API size_t ggml_curr_max_dynamic_size(void); + GGML_API void ggml_reset_curr_max_dynamic_size(void); + + + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const 
struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); + + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); + + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + // Converts a flat index into coordinates + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); + + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) + GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + + // + // operations on tensors with backpropagation + // + + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct 
ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type); + + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return scalar + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // mean along rows + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // sums repetitions in a into shape of b + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API 
struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // TODO: double-check this computation is correct + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // normalize along rows + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps); + + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. 
we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // a -> b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // a -> b, in-place, return view(b) + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // make contiguous + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, in-place + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, with new shape + GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make 
a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // set elements above the diagonal to -INF + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // rotary position embedding + // if mode & 1 == 1, skip n_past elements 
(DEPRECATED) + // if mode & 2 == 1, GPT-NeoX style + // if mode & 4 == 1, ChatGLM style + // + // b is an int32 vector with size a->ne[2], it contains the positions + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + float base, + bool down); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down); + + // alibi position embedding + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max); + + // clamp + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0); + + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + + enum ggml_op_pool { + 
GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + + // reverse of ggml_win_part + // used in sam + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use 
ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * 
ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + // + // automatic differentiation + // + + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + + + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); + + GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); + GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + + // graph allocation in a context + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); + GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API size_t ggml_graph_overhead(void); + + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + + // print info and performance information for the graph + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + + // dump the graph into a file using the dot format + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. 
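 // ---------------------------------------------------------------------------
 // [editor's sketch -- not part of this diff] Two hedged usage examples for the
 // APIs declared above; every kernel and value below is illustrative, not normative.
 //
 // 1) a custom operator via ggml_map_custom1(): the callback runs on every worker
 //    thread, with ith/nth partitioning the rows (my_relu is a made-up kernel):
 //
 //     static void my_relu(struct ggml_tensor * dst, const struct ggml_tensor * a,
 //                         int ith, int nth, void * userdata) {
 //         (void) userdata;
 //         for (int64_t r = ith; r < ggml_nrows(a); r += nth) { // interleaved rows
 //             const float * x = (const float *)((const char *)a->data + r*a->nb[1]);
 //             float       * y = (float       *)((char       *)dst->data + r*dst->nb[1]);
 //             for (int64_t i = 0; i < a->ne[0]; i++) y[i] = x[i] > 0.0f ? x[i] : 0.0f;
 //         }
 //     }
 //     ...
 //     struct ggml_tensor * out = ggml_map_custom1(ctx, t, my_relu, GGML_N_TASKS_MAX, NULL);
 //
 // 2) forward + backward for f = a*x + b, following the canonical ggml pattern:
 //
 //     struct ggml_init_params ip = { 16*1024*1024, NULL, false };
 //     struct ggml_context * ctx = ggml_init(ip);
 //     struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 //     ggml_set_param(ctx, x);                    // x becomes trainable, x->grad is created
 //     struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 //     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 //     struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x), b);
 //     struct ggml_cgraph gf = ggml_build_forward(f);
 //     struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
 //     ggml_set_f32(x, 2.0f); ggml_set_f32(a, 3.0f); ggml_set_f32(b, 4.0f);
 //     ggml_graph_compute_with_ctx(ctx, &gf, 4);  // forward: f = 3*2 + 4 = 10
 //     ggml_graph_reset(&gf);                     // zero the gradients
 //     ggml_set_f32(f->grad, 1.0f);               // seed df/df = 1
 //     ggml_graph_compute_with_ctx(ctx, &gb, 4);  // backward: x->grad = a = 3
 //     ggml_free(ctx);
 //
 //    The optimizers declared further below can then minimize f directly, e.g.:
 //
 //     struct ggml_opt_params op = ggml_opt_default_params(GGML_OPT_ADAM);
 //     op.adam.n_iter = 100;                      // example value
 //     enum ggml_opt_result res = ggml_opt(ctx, op, f);
 //
 // The gradient-checkpointing builder declared next constructs a more
 // memory-frugal backward graph for the same purpose.
 // ---------------------------------------------------------------------------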
+ GGML_API void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints); + // + // optimization + // + + // optimization methods + enum ggml_opt_type { + GGML_OPT_ADAM, + GGML_OPT_LBFGS, + }; + + // linesearch methods + enum ggml_linesearch { + GGML_LINESEARCH_DEFAULT = 1, + + GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + }; + + // optimization return values + enum ggml_opt_result { + GGML_OPT_OK = 0, + GGML_OPT_DID_NOT_CONVERGE, + GGML_OPT_NO_CONTEXT, + GGML_OPT_INVALID_WOLFE, + GGML_OPT_FAIL, + GGML_OPT_CANCEL, + + GGML_LINESEARCH_FAIL = -128, + GGML_LINESEARCH_MINIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_LINESEARCH_INVALID_PARAMETERS, + }; + + typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + + // optimization parameters + // + // see ggml.c (ggml_opt_default_params) for default values + // + struct ggml_opt_params { + enum ggml_opt_type type; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + int n_gradient_accumulation; + + // ADAM parameters + struct { + int n_iter; + + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + float gclip; // gradient clipping + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate the inv. 
Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_linesearch linesearch; + } lbfgs; + }; + + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + float loss_before; + float loss_after; + + struct { + struct ggml_tensor * g; // current gradient + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); + + // optimize the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); + + // + // quantization + // + + GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct 
gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id); + + // will abort if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void 
gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname, false); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(tensor_data, 1, tensor_data_size, f); // tensor_data/tensor_data_size: the caller's tensor payload + // void * data = malloc(gguf_get_meta_size(ctx)); + // gguf_get_meta_data(ctx, data); // fill the buffer with the meta data + // fseek(f, 0, SEEK_SET); + // fwrite(data, 1, gguf_get_meta_size(ctx), f); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + + // + // system info + // + + GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx2 (void); + GGML_API int ggml_cpu_has_avx512 (void); + GGML_API int ggml_cpu_has_avx512_vbmi(void); + GGML_API int ggml_cpu_has_avx512_vnni(void); + GGML_API int ggml_cpu_has_fma (void); + GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_metal (void); + GGML_API int ggml_cpu_has_f16c (void); + GGML_API int ggml_cpu_has_fp16_va (void); + GGML_API int ggml_cpu_has_wasm_simd (void); + GGML_API int ggml_cpu_has_blas (void); + GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_clblast (void); + GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_vsx (void); + + // + // Internal types and functions exposed for tests and benchmarks + // + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + + typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum 
ggml_type vec_dot_type; + } ggml_type_traits_t; + + GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + +#ifdef __cplusplus +} +#endif diff --git a/stable-diffusion.cpp/ggml/requirements.txt b/stable-diffusion.cpp/ggml/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9be8160aa17fef64eba47e12e9bbe6afec75d26e --- /dev/null +++ b/stable-diffusion.cpp/ggml/requirements.txt @@ -0,0 +1,7 @@ +accelerate==0.19.0 +numpy==1.24.3 +sentencepiece==0.1.98 +torch==2.0.1 +torchaudio==2.0.2 +torchvision==0.15.2 +transformers==4.29.2 \ No newline at end of file diff --git a/stable-diffusion.cpp/ggml/scripts/sync-llama.sh b/stable-diffusion.cpp/ggml/scripts/sync-llama.sh new file mode 100644 index 0000000000000000000000000000000000000000..b9b7aed14f1a52da3cf3a05c1c50f9574bbe51dc --- /dev/null +++ b/stable-diffusion.cpp/ggml/scripts/sync-llama.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +cp -rpv ../llama.cpp/ggml.c src/ggml.c +cp -rpv ../llama.cpp/ggml-alloc.c src/ggml-alloc.c +cp -rpv ../llama.cpp/ggml-backend.c src/ggml-backend.c +cp -rpv ../llama.cpp/ggml-cuda.h src/ggml-cuda.h +cp -rpv ../llama.cpp/ggml-cuda.cu src/ggml-cuda.cu +cp -rpv ../llama.cpp/ggml-opencl.h src/ggml-opencl.h +cp -rpv ../llama.cpp/ggml-opencl.cpp src/ggml-opencl.cpp +cp -rpv ../llama.cpp/ggml-metal.h src/ggml-metal.h +cp -rpv ../llama.cpp/ggml-metal.m src/ggml-metal.m +cp -rpv ../llama.cpp/ggml-metal.metal src/ggml-metal.metal +cp -rpv ../llama.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../llama.cpp/ggml-alloc.h include/ggml/ggml-alloc.h +cp -rpv ../llama.cpp/ggml-backend.h include/ggml/ggml-backend.h + +cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp +cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp +cp -rpv ../llama.cpp/tests/test-quantize-fns.cpp tests/test-quantize-fns.cpp +cp -rpv ../llama.cpp/tests/test-quantize-perf.cpp tests/test-quantize-perf.cpp diff --git a/stable-diffusion.cpp/ggml/scripts/sync-whisper.sh b/stable-diffusion.cpp/ggml/scripts/sync-whisper.sh new file mode 100644 index 0000000000000000000000000000000000000000..c17a091da9b49b4ba1d46297d84046dfc9b606e9 --- /dev/null +++ b/stable-diffusion.cpp/ggml/scripts/sync-whisper.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +cp -rpv ../whisper.cpp/ggml.c src/ggml.c +cp -rpv ../whisper.cpp/ggml-alloc.c src/ggml-alloc.c +cp -rpv ../whisper.cpp/ggml-cuda.h src/ggml-cuda.h +cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu +cp -rpv ../whisper.cpp/ggml-opencl.h src/ggml-opencl.h +cp -rpv ../whisper.cpp/ggml-opencl.cpp src/ggml-opencl.cpp +cp -rpv ../whisper.cpp/ggml-metal.h src/ggml-metal.h +cp -rpv ../whisper.cpp/ggml-metal.m src/ggml-metal.m +cp -rpv ../whisper.cpp/ggml-metal.metal src/ggml-metal.metal +cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../whisper.cpp/ggml-alloc.h include/ggml/ggml-alloc.h +cp -rpv ../whisper.cpp/examples/common.h examples/common.h +cp -rpv ../whisper.cpp/examples/common.cpp examples/common.cpp +cp -rpv ../whisper.cpp/examples/common-ggml.h examples/common-ggml.h +cp -rpv ../whisper.cpp/examples/common-ggml.cpp examples/common-ggml.cpp +cp -rpv ../whisper.cpp/whisper.h examples/whisper/whisper.h +cp -rpv ../whisper.cpp/whisper.cpp examples/whisper/whisper.cpp +cp -rpv ../whisper.cpp/examples/main/main.cpp examples/whisper/main.cpp +cp -rpv ../whisper.cpp/examples/quantize/quantize.cpp examples/whisper/quantize.cpp diff --git a/stable-diffusion.cpp/ggml/src/CMakeLists.txt b/stable-diffusion.cpp/ggml/src/CMakeLists.txt new file mode 100644 index 
0000000000000000000000000000000000000000..b225597edae62d7bd0f554ef3e43569cc2b1217f --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/CMakeLists.txt @@ -0,0 +1,326 @@ +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + add_compile_options(-Wunused -Wextra -Wcast-qual -Wdouble-promotion) + add_compile_options("$<$:-Wshadow;-Wno-unused-function;-Wmissing-prototypes>") + else() + # todo : windows + endif() +endif() + +# compiler flags + +if (NOT MSVC) + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations") +endif() + +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + +if (NOT UNAME_S) + execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S) +endif() +if (NOT UNAME_P) + execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P) +endif() +if (NOT UNAME_M) + execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M) +endif() +#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}") + +# Mac OS + Arm can report x86_64 +# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 +if (UNAME_S MATCHES "Darwin") + if (NOT UNAME_P MATCHES "arm") + execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M) + if (SYSCTL_M MATCHES "1") + #set(UNAME_P "arm") + #set(UNAME_M "arm64") + message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789") + endif() + endif() +endif() + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Emscripten") + message(STATUS "Emscripten detected") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + message(STATUS "ARM detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PPC64 detected") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector") +else() + message(STATUS "x86 detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c") + if (UNAME_S MATCHES "Darwin") + execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "AVX1.0") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + if (AVX1_M MATCHES "FMA") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + elseif (UNAME_S MATCHES "Linux") + message(STATUS "Linux detected") + execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M) + if (SSE3_M MATCHES "sse3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + endif() + elseif (UNAME_S MATCHES "Haiku") + 
message(STATUS "Haiku detected") + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + elseif (MSVC) + if (GGML_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512") + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() + elseif (GGML_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") + elseif (GGML_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") + endif() + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2") + endif() +endif() + +# ggml + +set(TARGET ggml) + +# on APPLE - include Accelerate framework +if (APPLE AND NOT GGML_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENBLAS) + set(OPENBLAS_INCLUDE_SEARCH_PATHS + /usr/include + /usr/include/openblas + /usr/include/openblas-base + /usr/local/include + /usr/local/include/openblas + /usr/local/include/openblas-base + /opt/OpenBLAS/include + $ENV{OpenBLAS_HOME} + $ENV{OpenBLAS_HOME}/include + ) + find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_library(OPENBLAS_LIB NAMES openblas libopenblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +if (GGML_CLBLAST) + set(CLBLAST_INCLUDE_SEARCH_PATHS + /usr/include + /usr/local/include + $ENV{CLBLAST_HOME} + $ENV{CLBLAST_HOME}/include + ) + find_path(CLBLAST_INC NAMES clblast.h PATHS ${CLBLAST_INCLUDE_SEARCH_PATHS}) + find_library(CLBLAST_LIB NAMES clblast) + find_library(OPENCL_LIB NAMES OpenCL) + if (CLBLAST_LIB AND OPENCL_LIB AND CLBLAST_INC) + message(STATUS "clBLAST found") + + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${CLBLAST_INC}) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CLBLAST_LIB} ${OPENCL_LIB}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CLBLAST) + + set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h) + + link_libraries("-Wl,--copy-dt-needed-entries") + else() + message(WARNING "clBLAST not found") + endif() +endif() + +if (GGML_CUBLAS) + cmake_minimum_required(VERSION 3.17) + + find_package(CUDAToolkit) + if (CUDAToolkit_FOUND) + message(STATUS "cuBLAS found") + + enable_language(CUDA) + + set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) + + 
add_compile_definitions(GGML_USE_CUBLAS) + + if (GGML_STATIC) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + if (CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") + endif() + else() + message(WARNING "cuBLAS not found") + endif() +endif() + +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h) + + add_compile_definitions(GGML_USE_METAL) + #add_compile_definitions(GGML_METAL_NDEBUG) + + # get full path to the file + #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") + + # copy ggml-metal.metal to bin directory + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() + +if (GGML_PERF) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF) +endif() + +add_library(${TARGET} + ggml.c + ggml-alloc.c + ggml-backend.c + ../include/ggml/ggml.h + ../include/ggml/ggml-alloc.h + ../include/ggml/ggml-backend.h + ${GGML_CUDA_SOURCES} + ${GGML_OPENCL_SOURCES} + ${GGML_METAL_SOURCES} + ) + +target_include_directories(${TARGET} PUBLIC + . + ../include + ../include/ggml + ${GGML_EXTRA_INCS} + ) + +if (MSVC) + target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) +else() + target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) +endif() + +if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + + target_link_libraries(${TARGET} PUBLIC + ${CMAKE_DL_LIBS} + ) + + target_compile_definitions(${TARGET} PUBLIC + GGML_SHARED + ) + + target_compile_definitions(${TARGET} PRIVATE + GGML_BUILD + ) + + if (GGML_METAL) + set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + endif() +endif() + +target_compile_definitions(${TARGET} PUBLIC + ${GGML_EXTRA_FLAGS} + ) + +if (MINGW) + target_link_libraries(${TARGET} PUBLIC + stdc++ + ) +endif() + +if (GGML_CUDA_SOURCES) + message(STATUS "GGML CUDA sources found, configuring CUDA architecture") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61;70") + set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + if (NOT MSVC) + target_link_libraries(ggml PUBLIC stdc++) + endif() +endif() + +set (GGML_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml.h + ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml-alloc.h) +set_target_properties(${TARGET} PROPERTIES + PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") + +install(TARGETS ${TARGET} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static + PUBLIC_HEADER DESTINATION include/ggml + ) diff --git a/stable-diffusion.cpp/ggml/src/ggml-alloc.c b/stable-diffusion.cpp/ggml/src/ggml-alloc.c new file mode 100644 index 0000000000000000000000000000000000000000..34eba3f830e8496a7d9bc77edb3b0ddc2bd93017 --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml-alloc.c @@ -0,0 +1,594 @@ +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml.h" +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +#define UNUSED(x) (void)(x)
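// ---------------------------------------------------------------------------
// [editor's sketch -- not part of this diff] The allocator implemented in this
// file is normally driven in two passes: a "measure" pass that only records
// sizes, then a real pass over a buffer of the measured size. build_graph() is
// a hypothetical caller-supplied function that records the same cgraph each time:
//
//     size_t align = 32; // example alignment; real code queries the backend
//     struct ggml_allocr * ma = ggml_allocr_new_measure(align);
//     size_t mem = ggml_allocr_alloc_graph(ma, build_graph(ctx)) + align;
//     ggml_allocr_free(ma);
//
//     void * buf = malloc(mem);
//     struct ggml_allocr * ra = ggml_allocr_new(buf, mem, align);
//     ggml_allocr_alloc_graph(ra, build_graph(ctx)); // tensors now get real addresses
// ---------------------------------------------------------------------------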
+#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + +//#define GGML_ALLOCATOR_DEBUG + +//#define AT_PRINTF printf +#define AT_PRINTF(...) ((void)0) + +struct hash_node { + struct ggml_tensor * t; + int n_children; + int n_views; +}; + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} + +// TODO: GGML_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 256 + +struct ggml_allocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * data; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; + int parse_seq[GGML_MAX_CONCUR]; + int parse_seq_len; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + +// check if a tensor is allocated by this buffer +static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { + return tensor->buffer == alloc->buffer; +} + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->view_src != NULL; +} + +void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block besides the last block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + AT_PRINTF("block 
%d\n", best_fit_block); + + if (best_fit_block == -1) { + // the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + + tensor->data = addr; + AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); + tensor->buffer = alloc->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, tensor); + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + if (ggml_allocr_is_own(alloc, tensor) == false) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to ignore it + AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); + return; + } + + void * ptr = tensor->data; + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + + ggml_backend_buffer_free_tensor(alloc->buffer, tensor); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == 
block->addr) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; +} + +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { + for (int i = 0; i < n; i++) { + alloc->parse_seq[i] = list[i]; + } + alloc->parse_seq_len = n; +} + +void ggml_allocr_reset(struct ggml_allocr * alloc) { + alloc->n_free_blocks = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; + alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; +} + +struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size); + + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr)); + + *alloc = (struct ggml_allocr){ + /*.buffer = */ buffer, + /*.buffer_owned = */ true, + /*.base = */ ggml_backend_buffer_get_base(buffer), + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { + struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment); + alloc->measure = true; + + return alloc; +} + +struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr)); + + *alloc = (struct ggml_allocr){ + /*.buffer = */ buffer, + /*.buffer_owned = */ false, + /*.base = */ ggml_backend_buffer_get_base(buffer), + /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +void ggml_allocr_free(struct ggml_allocr * alloc) { + if (alloc->buffer_owned) { + ggml_backend_buffer_free(alloc->buffer); + } + free(alloc); +} + +bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if 
(a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + return true; + + default: + return false; + } +} + +static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { + assert(view->view_src != NULL && view->view_src->data != NULL); + view->backend = view->view_src->backend; + view->buffer = view->view_src->buffer; + view->data = (char *)view->view_src->data + view->view_offs; + + // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend + // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras + assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend); + ggml_backend_buffer_init_tensor(alloc->buffer, view); +} + +static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { + struct hash_node * ht = alloc->hash_table; + if (node->data == NULL) { + if (ggml_is_view(node)) { + init_view(alloc, node); + } else { + // see if we can reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + + // if the node's data is external, then we cannot re-use it + if (ggml_allocr_is_own(alloc, parent) == false) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. 
+ // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->view_src = view_src; + view_src_hn->n_views += 1; + init_view(alloc, node); + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->view_src = parent; + p_hn->n_views += 1; + init_view(alloc, node); + return; + } + } + } + } + ggml_allocr_alloc(alloc, node); + } + } +} + +size_t ggml_allocr_alloc_graph_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { + + // reset hash table + struct hash_node * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE); + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (ggml_is_view(node)) { + struct ggml_tensor * view_src = node->view_src; + hash_get(ht, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(alloc, node); + } + } + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + hash_get(ht, parent)->n_children += 1; + if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(alloc, parent); + } + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are not overwritten by each other + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node(alloc, input); + } + } + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind; + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); + } + + // allocate node + allocate_node(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + } + + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? 
last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i; + struct ggml_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocr_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocr_free_tensor(alloc, parent); + } + } + } + } + } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } + } + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_allocr_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { + return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); +} + +size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { + return alloc->max_size; +} diff --git a/stable-diffusion.cpp/ggml/src/ggml-backend.c b/stable-diffusion.cpp/ggml/src/ggml-backend.c new file mode 100644 index 0000000000000000000000000000000000000000..ca8d83dafe47c9763b7f648b9d26bd4e6dfb985e --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml-backend.c @@ -0,0 +1,385 @@ +#include "ggml-backend.h" +#include "ggml-alloc.h" + +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define UNUSED GGML_UNUSED + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +// backend buffer + +ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i iface, + ggml_backend_buffer_context_t context, + size_t size) { + ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); + + GGML_ASSERT(iface.get_base != NULL); + + (*buffer) = (struct ggml_backend_buffer) { + /* .interface = */ iface, + /* .backend = */ backend, + /* .context = */ context, + /* .size = */ size, + }; + + return buffer; +} + +void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { + if (buffer->iface.free_buffer != NULL) { + buffer->iface.free_buffer(buffer); + } + free(buffer); +} + +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return ggml_backend_get_alignment(buffer->backend); +} + +void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return buffer->iface.get_base(buffer); +} + +size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { + return buffer->size; +} + +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.get_alloc_size) { + return buffer->iface.get_alloc_size(buffer, tensor); + } + return ggml_nbytes(tensor); +} + +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.init_tensor) { + buffer->iface.init_tensor(buffer, tensor); + } +} + +void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.free_tensor) { + buffer->iface.free_tensor(buffer, tensor); + } +} + +// backend + +ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) { + return tensor->buffer->backend; +} + +const char * ggml_backend_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + +void ggml_backend_free(ggml_backend_t backend) { + backend->iface.free(backend); +} + +ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { + return backend->iface.alloc_buffer(backend, size); +} + +size_t ggml_backend_get_alignment(ggml_backend_t backend) { + return backend->iface.get_alignment(backend); +} + +void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_synchronize(ggml_backend_t backend) { + backend->iface.synchronize(backend); +} + +ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return backend->iface.graph_plan_create(backend, cgraph); +} + +void 
ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->iface.graph_plan_free(backend, plan); +} + +void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->iface.graph_plan_compute(backend, plan); +} + +void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + backend->iface.graph_compute(backend, cgraph); +} + +bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return backend->iface.supports_op(backend, op); +} + +// backend copy + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { + //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); + //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); + + if (src == dst) { + return; + } + + // TODO: allow backends to support copy to/from same backend + + if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { + ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { + ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); + } else { + // shouldn't be hit when copying from/to CPU + #ifndef NDEBUG + fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend)); + #endif + size_t nbytes = ggml_nbytes(src); + void * data = malloc(nbytes); + ggml_backend_tensor_get(src, data, 0, nbytes); + ggml_backend_tensor_set(dst, data, 0, nbytes); + free(data); + } +} + +// backend CPU + +struct ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +static const char * ggml_backend_cpu_name(ggml_backend_t backend) { + return "CPU"; + + UNUSED(backend); +} + +static void ggml_backend_cpu_free(ggml_backend_t backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} + +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no 
initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + +// for buffers from ptr, free is not called +static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + +static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { + size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned + void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + + return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); +} + +static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { + return TENSOR_ALIGNMENT; + UNUSED(backend); +} + +static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +struct ggml_backend_plan_cpu { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cgraph = *cgraph; + + if (cpu_plan->cplan.work_size > 0) { + cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + } + + return cpu_plan; +} + +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + free(cpu_plan->cplan.work_data); + free(cpu_plan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + 
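    // note (editorial): cplan.work_data was sized and allocated once in
    // ggml_backend_cpu_graph_plan_create() above, so replaying a plan here does
    // no allocation or planning work -- that is the point of the plan API.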
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + + if (cpu_ctx->work_size < cplan.work_size) { + // TODO: may be faster to free and use malloc to avoid the copy + cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); + cpu_ctx->work_size = cplan.work_size; + } + + cplan.work_data = cpu_ctx->work_data; + + ggml_graph_compute(cgraph, &cplan); +} + +static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i cpu_backend_i = { + /* .get_name = */ ggml_backend_cpu_name, + /* .free = */ ggml_backend_cpu_free, + /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_get_alignment, + /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, + /* .synchronize = */ ggml_backend_cpu_synchronize, + /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, + /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cpu_graph_compute, + /* .supports_op = */ ggml_backend_cpu_supports_op, +}; + +ggml_backend_t ggml_backend_cpu_init(void) { + struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + + *cpu_backend = (struct ggml_backend) { + /* .interface = */ cpu_backend_i, + /* .context = */ ctx + }; + return cpu_backend; +} + +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->iface.get_name == ggml_backend_cpu_name; +} + +void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->n_threads = n_threads; +} + +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) { + return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); +} diff --git a/stable-diffusion.cpp/ggml/src/ggml-cuda.cu b/stable-diffusion.cpp/ggml/src/ggml-cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..5bd83bb5c056a42ff32497b8daa1cfe050acce00 --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml-cuda.cu @@ -0,0 +1,7781 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T 
HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#endif // defined(GGML_USE_HIPBLAS)
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int &>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
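+    // scalar fallback: per-lane saturating subtract; e.g. va[i] = -100, vb[i] = 100
+    // gives tmp = -200, which the clamps below saturate to the int8 minimum, -128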
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#define CUDA_CHECK(err) \
+    do { \
+        cudaError_t err_ = (err); \
+        if (err_ != cudaSuccess) { \
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
+            exit(1); \
+        } \
+    } while (0)
+
+#if CUDART_VERSION >= 12000
+#define CUBLAS_CHECK(err) \
+    do { \
+        cublasStatus_t err_ = (err); \
+        if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
+                err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
+            exit(1); \
+        } \
+    } while (0)
+#else
+#define CUBLAS_CHECK(err) \
+    do { \
+        cublasStatus_t err_ = (err); \
+        if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
+            exit(1); \
+        } \
+    } while (0)
+#endif // CUDART_VERSION >= 12000
+
+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif //GGML_CUDA_F16
+
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte
alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +template +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream); +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_cuda_op_mul_mat_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream); +typedef void (*ggml_cuda_op_flatten_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream); + +// QK = number of values after dequantization +// QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization + +#define QK4_0 32 +#define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) +typedef struct { + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +#define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) +typedef struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +#define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) +typedef struct { + half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +#define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * QR5_1)) +typedef struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +#define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) +typedef struct { + half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct { + half2 ds; // ds.x = delta, ds.y = sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, 
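+              // i.e. 2*2 + 32 = 36 bytes for a block of QK8_1 == 32 quants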
"wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); +typedef void (*load_tiles_cuda_t)( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); + +//================================= k-quants + +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half2 dm; // super-block scale for quantized scales/mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; +//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); + +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) +#ifdef GGML_QKK_64 +typedef struct { + half dm[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif + +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) +#ifdef GGML_QKK_64 +typedef struct { + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales + half d; // delta +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, 
"wrong q6_K block size/padding"); + +#define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#define CUDA_ADD_BLOCK_SIZE 256 +#define CUDA_MUL_BLOCK_SIZE 256 +#define CUDA_GELU_BLOCK_SIZE 256 +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_SCALE_BLOCK_SIZE 256 +#define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_ALIBI_BLOCK_SIZE 32 +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 +#define CUDA_QUANTIZE_BLOCK_SIZE 256 +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE +#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE + +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define MAX_STREAMS 8 +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +inline cudaError_t ggml_cuda_set_device(const int device) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + + if (device == current_device) { + return cudaSuccess; + } + + return cudaSetDevice(device); +} + +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = true; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= kx) { + return; + } + dst[i] = x[i] + y[i%ky]; +} + +static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __hadd(x[i], __float2half(y[i])); +} + +static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= kx) { + return; + } + dst[i] = x[i] * y[i%ky]; +} + +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + +static __global__ void silu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + expf(-x[i])); +} + +static 
__device__ __forceinline__ float2 warp_reduce_sum(float2 a) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + } + return a; +} + +template +static __global__ void norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-5f; + + float2 mean_var = make_float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x += xi; + mean_var.y += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var); + if (block_size > WARP_SIZE) { + __shared__ float2 s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + __syncthreads(); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var); + } + + const float mean = mean_var.x / ncols; + const float var = mean_var.y / ncols - mean * mean; + const float inv_std = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +template +static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float mean = tmp / ncols; + const float scale = rsqrtf(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); 
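+    // qh packs the 5th bit of each of the 32 quants; the shifts below move bit
+    // (iqs) into bit 4 of the low value and bit (iqs + 16) into bit 4 of the high
+    // value, e.g. for iqs = 0: xh_0 = (qh << 4) & 0x10 and xh_1 = (qh >> 12) & 0x10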
+ + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_CUDA_F16 +} + +//================================== k-quants + +template +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = threadIdx.x/4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16*is0 + 4*(threadIdx.x%4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? 
(x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 
= dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = blockIdx.x; +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
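+    // index sketch, assuming a launch with 32 threads in x (not shown here): with
+    // K_QUANTS_PER_ITERATION == 2, tid = 0..15 and step = 8, so im = tid/8 picks the
+    // super-block half and l0 = 2*in walks the even offsets 0,2,...,14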
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
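+    // q3_K packs 16 6-bit scales into 12 bytes; the kmask1/kmask2 unpacking below
+    // rebuilds the 8 signed scales s[0..7] for the super-block half chosen by im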
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 
0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; + s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * 
K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 
16 : 0)) + + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
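+    // each 6-bit quant is split between ql (low 4 bits) and qh (high 2 bits); the
+    // ql/qh/s offsets below select the super-block half (im) this thread handles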
+ const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const half * x = (const half *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + +static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + +static __global__ void 
quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<half &>(y[ib].ds.x) = d;
+    reinterpret_cast<half &>(y[ib].ds.y) = sum;
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0] = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
+    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 
1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x; + y[iybs + iqs + y_offset] = v.y; +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * 
(sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const float & d8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const 
int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc 
= 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + 
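+    // Why two accumulators: with the super-block scale/min pair dm5 = (d, dmin), each weight is
+    // reconstructed as w = d*sc[i]*q - dmin*m[i], so the dot product with the q8_1 activations y
+    // splits into
+    //   sum_j w_j*y_j = d * sum_i sc[i]*(q . y)_i  -  dmin * sum_i m[i]*(sum of y in group i),
+    // which is exactly the d8[i]-weighted sumf_d and sumf_m above, combined once below.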
const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = 
get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
+
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
+
+    *x_ql = tile_x_qs;
+    *x_dm = (half2 *) tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+    const int kbx = k / QI4_0;
+    const int kqsx = k % QI4_0;
+
+    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
+}
+
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (const float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
+
+    *x_ql = tile_x_qs;
+    *x_dm = tile_x_dm;
+}
+
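+// The tile buffers in the allocate_tiles_* helpers use a row stride of WARP_SIZE + 1 ints rather
+// than WARP_SIZE (the extra "+ mmq_y" elements). Skewing each row by one slot makes a warp that
+// walks a column of the tile touch 32 distinct shared-memory banks instead of one. A minimal,
+// self-contained sketch of the same idiom (illustrative only, not part of the build; the kernel
+// name and tile shape here are made up for the example):
+#if 0
+static __global__ void padded_tile_example() {
+    __shared__ int tile[8 * (WARP_SIZE + 1)]; // 8 rows, stride 33
+    // element (row, col) lives at row*(WARP_SIZE + 1) + col, so a column walk over
+    // row = 0..7 at fixed col maps to banks (col + row) % 32 -- all different.
+    tile[threadIdx.y * (WARP_SIZE + 1) + threadIdx.x] = threadIdx.x;
+    __syncthreads();
+}
+#endif
+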
+template static __device__ __forceinline__ void load_tiles_q4_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + + *x_ql = tile_x_ql; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q5_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (block_q5_0 *) 
vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q5_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & 
i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); +} + +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q8_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) 
{ + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q2_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = 
WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q3_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + 
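+    // GGML_CUDA_ASSUME is presumably a thin wrapper over __builtin_assume on toolkits that
+    // support it (and a no-op otherwise): promising these thread-index bounds to the compiler
+    // lets it drop range checks and simplify the tile-index arithmetic in the unrolled loops.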
GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = __vsubss4(vll, vlh); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int 
u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2half(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q4_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k 
>= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const 
int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q5_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 
if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q6_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + 
GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +template +static __device__ __forceinline__ void mul_mat_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = blockIdx.x*mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = blockIdx.y*mmq_x; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + half2 * tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, 
&tile_x_qh, &tile_x_sc); + + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = __low2half(*dsi_src); + } + } + + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } + } + } + + __syncthreads(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + threadIdx.x + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = 
MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define 
NWARPS_Q5_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = 
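The __launch_bounds__(WARP_SIZE*NWARPS_..., 2) annotations on the RDNA2 variants ask the compiler to cap per-thread register use so that at least two of these large-tile blocks can stay resident per compute unit; without the hint the mmq tiles tend to cost occupancy. A toy example of the attribute (kernel and sizes are illustrative):

    __global__ void __launch_bounds__(256, 2) scale_example(const float * x, float * y, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] = 2.0f*x[i]; // register use is limited so 2 blocks of 256 threads fit per SM/CU
        }
    }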
MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = 
MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) 
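The #if ladder inside each of these kernels resolves at device-compile time: __CUDA_ARCH__ is defined only while compiling device code, once per target architecture, so every instantiation contains exactly one branch and the unused tile configurations cost nothing at runtime. A minimal illustration (700 and 610 mirror what CC_VOLTA and MIN_CC_DP4A are assumed to be earlier in this file):

    __global__ void arch_probe(int * out) {
    #if __CUDA_ARCH__ >= 700
        *out = 700; // Volta and newer: the AMPERE tile sizes above are compiled in
    #elif __CUDA_ARCH__ >= 610
        *out = 610; // Pascal: DP4A available, the PASCAL tile sizes are compiled in
    #else
        *out = 0;   // no DP4A: the mmq path asserts false above
    #endif
    }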
+#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + 
mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +template +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index + + const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +template +static 
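The loop over mask at the end of mul_mat_vec_q above is a warp-level butterfly reduction: five __shfl_xor_sync steps with XOR distances 16, 8, 4, 2, 1 leave every lane holding the warp-wide total, so lane 0 can write the row result without touching shared memory. As a standalone helper (a sketch; assumes a full 32-lane warp):

    static __device__ float warp_reduce_sum(float v) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask, 32); // XOR distances 16, 8, 4, 2, 1
        }
        return v; // every lane now holds the sum over the warp
    }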
__global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +#endif // GGML_CUDA_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_CUDA_F16 + } +} + +static __global__ void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / (nchannels_y / nchannels_x); + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int 
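Under GGML_CUDA_F16 the dequantize_mul_mat_vec kernel above accumulates into a half2, so the two dequantized weights of each step are multiplied and added with paired f16 instructions before the final fold into f32. A self-contained sketch of the same trick (the function name is illustrative):

    #include <cuda_fp16.h>

    static __device__ float fused_pair_dot(half2 v, half y0, half y1) {
        const half2 prod = __hmul2(v, __halves2half2(y0, y1)); // two f16 multiplies in one op
        return __low2float(prod) + __high2float(prod);         // fold the two lanes into f32
    }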
channel_stride_x, const int channel_x_divisor) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / channel_x_divisor; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + half * dsti = (half *) cdsti; + + *dsti = __float2half(*xi); +} + +template +static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = i - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = i - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +// rope == RoPE == rotary positional embedding + +template +static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const int p = has_pos ? 
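The rope kernel being defined here applies a plain 2-D rotation to each (x0, x1) pair, with an angle that decays geometrically across columns: theta = p * freq_scale * theta_scale^(col/2). A host-side reference of the per-pair update (a sketch; names are illustrative):

    #include <math.h>

    static void rope_pair_ref(float x0, float x1, int p, float freq_scale,
                              float theta_scale, int col, float * out0, float * out1) {
        const float theta = p*freq_scale*powf(theta_scale, col/2); // col is even, so col/2 is exact
        *out0 = x0*cosf(theta) - x1*sinf(theta);
        *out1 = x0*sinf(theta) + x1*cosf(theta);
    }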
pos[i2] : 0; + const float p0 = p*freq_scale; + const float theta = p0*powf(theta_scale, col/2); + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int i = row*ncols + col/2; + const int i2 = row/p_delta_rows; + + const int p = has_pos ? pos[i2] : 0; + const float p0 = p*freq_scale; + const float theta = p0*powf(theta_scale, col/2); + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + ncols/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, const int n_ctx) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const float col_theta_scale = powf(theta_scale, col); + // FIXME: this is likely wrong + const int p = pos != nullptr ? pos[i2] : 0; + + const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + +static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.y*blockIdx.y + threadIdx.y; + const int row = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + // dst[i] = col > n_past + row ? 
-INFINITY : x[i];
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+}
+
+// the CUDA soft max implementation differs from the CPU implementation
+// in that floats are used instead of doubles
+static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;
+
+    float max_val = -INFINITY;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }
+
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
+
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        const float val = expf(x[i] - max_val);
+        tmp += val;
+        dst[i] = val;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    const float inv_tmp = 1.f / tmp;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        dst[i] *= inv_tmp;
+    }
+}
+
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+}
+
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
+static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
+    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+}
+
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
+}
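soft_max_f32 above uses the usual two-pass stabilization: subtract the row maximum before exponentiating so expf cannot overflow, then normalize by the sum. A host-side reference with the same structure (an illustrative sketch):

    #include <math.h>

    static void softmax_ref(const float * x, float * dst, const int n) {
        float max_val = -INFINITY;
        for (int i = 0; i < n; ++i) {
            max_val = fmaxf(max_val, x[i]);
        }
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            dst[i] = expf(x[i] - max_val); // in (0, 1], no overflow
            sum += dst[i];
        }
        const float inv_sum = 1.0f/sum;
        for (int i = 0; i < n; ++i) {
            dst[i] *= inv_sum;
        }
    }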
+
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
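Every launcher in this group sizes its grid with the same ceil-division idiom, (n + block - 1) / block, which rounds up so the final partial block is still launched; the kernels then bounds-check with an early return. As a tiny named helper (hypothetical, not from the file):

    static int ceil_div(const int n, const int block) {
        return (n + block - 1) / block; // ceil_div(1000, 256) == 4; the 4th block covers the tail
    }

    // e.g. the dequantize launchers above amount to:
    // const int num_blocks = ceil_div(k, CUDA_DEQUANTIZE_BLOCK_SIZE);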
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const dim3 block_dims(32, 1, 1);
+    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
+}
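The wrappers above launch with block_dims(32, ny, 1): one 32-lane warp per matrix row along x, ny rows stacked per block along y, and a grid of (nrows + ny - 1) / ny row-blocks (the q5_K path uses ny = 1). For the other K-quant paths ny = 2 / K_QUANTS_PER_ITERATION, i.e. fewer rows per block when each lane consumes more quants per iteration. The row a thread works on (a sketch mirroring the kernels):

    static __device__ int dmmv_row(void) {
        // matches "const int row = blockIdx.y*blockDim.y + threadIdx.y" in the kernels above
        return blockIdx.y*blockDim.y + threadIdx.y;
    }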
+
+static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK4_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK5_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK5_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK8_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const
int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<1, 1, convert_f16><<>>(vx, y, k); +} + +static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + dequantize_block<1, 1, convert_f32><<>>(vx, y, k); +} + +static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows); +} + +static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F32: + return convert_fp32_to_fp16_cuda; + default: + return nullptr; + } +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F16: + return convert_fp16_to_fp32_cuda; + default: + return nullptr; + } +} + +static void ggml_mul_mat_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = 
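ggml_get_to_fp16_cuda / ggml_get_to_fp32_cuda above return plain function pointers, so a conversion routine is picked once per tensor type and then invoked uniformly; unsupported types yield nullptr for the caller to reject. A sketch of the pattern (the to_fp32_cuda_t typedef is assumed from earlier in the file to have roughly this shape; convert_rows is hypothetical):

    typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); // assumed shape

    static void convert_rows(ggml_type type, const void * vx, float * y, int k, cudaStream_t stream) {
        const to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(type);
        GGML_ASSERT(to_fp32 != nullptr); // e.g. dequantize_row_q4_0_cuda for GGML_TYPE_Q4_0
        to_fp32(vx, y, k, stream);
    }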
NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x 
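All the ggml_mul_mat_*_q8_1_cuda wrappers repeat the same runtime ladder over g_compute_capabilities[id]: the constants earlier in this file appear to encode AMD GPUs as compute capabilities offset above NVIDIA's by CC_OFFSET_AMD, so a single integer comparison distinguishes RDNA2, RDNA1, Volta-class and Pascal-class devices. A condensed sketch of one such ladder (pick_mmq_x is a hypothetical helper, shown for the q4_0 constants):

    static int pick_mmq_x(const int compute_capability) {
        if (compute_capability >= CC_RDNA2)      return MMQ_X_Q4_0_RDNA2;
        if (compute_capability >= CC_OFFSET_AMD) return MMQ_X_Q4_0_RDNA1;  // other AMD GPUs
        if (compute_capability >= CC_VOLTA)      return MMQ_X_Q4_0_AMPERE;
        if (compute_capability >= MIN_CC_DP4A)   return MMQ_X_Q4_0_PASCAL;
        GGML_ASSERT(false); // no DP4A: quantized mmq unsupported
        return 0;
    }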
% mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = 
MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + +#if QK_K == 256 + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +#endif +} + +static void ggml_mul_mat_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const 
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
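+
+// the f16 launchers below handle src0 in permuted / non-contiguous layouts (presumably
+// the attention tensors): one warp per dst row, with the z dimension of the grid
+// iterating over the nchannels_y channels of src1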
+
+static void ggml_mul_mat_p021_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
+    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
+}
+
+static void ggml_cpy_f32_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
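+
+// note: cpy_f32_f16 is the name of the templated copy kernel, not an f32->f16 cast;
+// the per-element op is the template parameter, so cpy_f32_f16<cpy_1_f32_f32> performs
+// a plain f32->f32 copy between differently-strided tensors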
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+}
+
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                      const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
+}
+
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                           const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
+}
+
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                              const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
+}
+
+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(nrows_x, block_num_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 256
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
+        }
+    }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
+    void * ptr;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
+    return ptr;
+}
+
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
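+// one-time per-process setup: enumerate devices, derive the default g_tensor_split
+// proportionally to each device's total VRAM, and create MAX_STREAMS streams plus a
+// cuBLAS handle (TF32 math mode) per device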
+void ggml_init_cublas() {
+    static bool initialized = false;
+
+    if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
+        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+        int64_t total_vram = 0;
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
+        for (int id = 0; id < g_device_count; ++id) {
+            cudaDeviceProp prop;
+            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
+            g_tensor_split[id] = total_vram;
+            total_vram += prop.totalGlobalMem;
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
+            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        }
+        for (int id = 0; id < g_device_count; ++id) {
+            g_tensor_split[id] /= total_vram;
+        }
+
+        for (int id = 0; id < g_device_count; ++id) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
+
+            // create cuda streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+            }
+
+            // create cublas handle
+            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
+            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+        }
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+        initialized = true;
+    }
+}
+
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
+    bool all_zero = true;
+    for (int i = 0; i < g_device_count; ++i) {
+        if (tensor_split[i] != 0.0f) {
+            all_zero = false;
+            break;
+        }
+    }
+    if (all_zero) {
+        return;
+    }
+    float split_sum = 0.0f;
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] = split_sum;
+        split_sum += tensor_split[i];
+    }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] /= split_sum;
+    }
+}
+
+void * ggml_cuda_host_malloc(size_t size) {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        // The allocation error can be bypassed. A null ptr will be assigned out of this function.
+        // This can fix the OOM error in WSL.
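+        // note: cudaGetLastError() below also clears the sticky error state left by
+        // the failed cudaMallocHost, so a later CUDA_CHECK does not trip on it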
+        cudaGetLastError();
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+                size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
+
+static cudaError_t ggml_cuda_cpy_tensor_2d(
+    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
+
+    cudaMemcpyKind kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = cudaMemcpyHostToDevice;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+        kind = cudaMemcpyDeviceToDevice;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        CUDA_CHECK(cudaGetDevice(&id));
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
+    const int64_t ne0 = src->ne[0];
+    const int64_t nb0 = src->nb[0];
+    const int64_t nb1 = src->nb[1];
+    const int64_t nb2 = src->nb[2];
+    const int64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
+    } else if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+            if (r != cudaSuccess) return r;
+        }
+        return cudaSuccess;
+    }
+}
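+
+// ggml_cuda_cpy_tensor_2d degrades gracefully with the source layout: a single memcpy
+// when rows are contiguous, one 2D copy when only the row stride differs, and a
+// per-row 2D copy (treating each row as a matrix with cols=1) in the fully strided case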
+
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *)       dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
+inline void ggml_cuda_op_add(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_mul(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_silu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( 
dst->type == GGML_TYPE_F32); + + silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_rms_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_mul_mat_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} + +static int64_t get_row_rounding(ggml_type type) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int64_t id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_compute_capabilities[id]) { + min_compute_capability = g_compute_capabilities[id]; + } + if (max_compute_capability < g_compute_capabilities[id]) { + max_compute_capability = g_compute_capabilities[id]; + } + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_VOLTA ? 
128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +inline void ggml_cuda_op_mul_mat_vec_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, 
dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + +#ifdef GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_F16 + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_mul_mat_cublas( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); + + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + + const int compute_capability = g_compute_capabilities[id]; + + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src0->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as); + to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); + } + const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16; + + half * src1_as_f16 = nullptr; + size_t src1_as = 0; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = src1_ncols*ne10; + src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); + } + const half * src1_ptr = src1->type == GGML_TYPE_F16 ? 
(const half *) src1_ddq_i : src1_as_f16; + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as); + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha_f16, src0_ptr, CUDA_R_16F, ne00, + src1_ptr, CUDA_R_16F, ne10, + &beta_f16, dst_f16, CUDA_R_16F, ldc, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream); + + ggml_cuda_pool_free(dst_f16, dst_as); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_as_f16, src0_as); + } + + if (src1_as != 0) { + ggml_cuda_pool_free(src1_as_f16, src1_as); + } + } + else { + float * src0_ddq_as_f32 = nullptr; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32_cuda != nullptr); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; + + const float alpha = 1.0f; + const float beta = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + } + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_rope( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + // RoPE alteration for extended context + + float freq_base, freq_scale; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; + } + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + // compute + if (is_glm) { + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); + } else if (is_neox) { + GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, 
main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_alibi( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); + + (void) src1; + (void) src1_dd; +} + +inline void ggml_cuda_op_diag_mask_inf( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_soft_max( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_scale( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float scale; + // HACK: support for ggml backend interface + if (src1->backend == GGML_BACKEND_CPU) { + scale = ((float *) src1->data)[0]; + } else { + // TODO: pass pointer to kernel instead of copying to host + CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); + } + + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); + 
CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + + const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + // as = actual size + size_t src0_asf = 0; + size_t src1_asf = 0; + size_t dst_asf = 0; + + ggml_cuda_set_device(g_main_device); + const cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1 && !src1_stays_on_host) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (src0_asf > 0) { + ggml_cuda_pool_free(src0_ddf, src0_asf); + } + if (src1_asf > 0) { + ggml_cuda_pool_free(src1_ddf, src1_asf); + } + if (dst_asf > 0) { + ggml_cuda_pool_free(dst_ddf, dst_asf); + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + for (int id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } 
+#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + +static void ggml_cuda_op_mul_mat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) { + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nrows0 = ggml_nrows(src0); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_nrows(src1); + + GGML_ASSERT(ne03 == ne13); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + ggml_cuda_set_peer_access(ne11); + + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + + GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); + + const int64_t i02_divisor = ne12 / ne02; + + const size_t src0_ts = ggml_type_size(src0->type); + const size_t src0_bs = ggml_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_is_contiguous(src0); + + const bool src1_is_contiguous = ggml_is_contiguous(src1); + const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? + ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 > 1)); + GGML_ASSERT(!(split && ne03 > 1)); + GGML_ASSERT(!(split && ne02 < ne12)); + + // dd = data device + char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1 + float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + // as = actual size + size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0}; + + int64_t row_low[GGML_CUDA_MAX_DEVICES]; + int64_t row_high[GGML_CUDA_MAX_DEVICES]; + + for (int64_t id = 0; id < g_device_count; ++id) { + // by default, use all rows + row_low[id] = 0; + row_high[id] = ne01; + + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes + if (split) { + const int64_t rounding = get_row_rounding(src0->type); + + if (id != 0) { + row_low[id] = ne01*g_tensor_split[id]; + row_low[id] -= row_low[id] % rounding; + } + + if (id != g_device_count - 1) { + row_high[id] = ne01*g_tensor_split[id + 1]; + row_high[id] -= row_high[id] % rounding; + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][0]; + + if (src0_on_device && src0_is_contiguous) { + src0_dd[id] = 
(char *) src0_extra->data_device[id]; + } else { + const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + } + + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + } + + if (convert_src1_to_q8_1) { + src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + + if (src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + } + } + + if (dst_on_device) { + dst_dd[id] = (float *) dst_extra->data_device[id]; + } else { + const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); + dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); + } + } + + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && g_device_count > 1) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); + } + + const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = row_high[id] - row_low[id]; + + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][is]; + + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0)); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs; + float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; + float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + dst_dd_i += row_low[id]; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset; + CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, + cudaMemcpyDeviceToDevice, stream)); + } else { + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } + } + } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_ASSERT(false); + } + + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + } + + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); + } + + // do the computation + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + cudaMemcpyKind kind; + if (dst->backend == GGML_BACKEND_CPU) { + dst_off_device = dst->data; + kind = cudaMemcpyDeviceToHost; + } else if (dst->backend == GGML_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = cudaMemcpyDeviceToDevice; + } else { + GGML_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. 
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + row_low[id]; + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, kind, stream)); + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); + } + } + + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); + } + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + // free buffers again when done + if (src0_as[id] > 0) { + ggml_cuda_pool_free(src0_dd[id], src0_as[id]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); + } + if (src1_asq[id] > 0) { + ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); + } + if (dst_as[id] > 0) { + ggml_cuda_pool_free(dst_dd[id], dst_as[id]); + } + } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + for (int64_t id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + +static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); +} + +static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); +} + +static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); +} + +static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); +} + +static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); +} + +static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + return (src0->type == GGML_TYPE_F32 || 
src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); +} + +static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); +} + +static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); + GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int64_t row_stride_x = nb01 / sizeof(half); + const int64_t channel_stride_x = nb02 / sizeof(half); + + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); +} + +static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + + int64_t min_compute_capability = INT_MAX; + for (int64_t id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_compute_capabilities[id]; + } + } + + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { + +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); +#endif // GGML_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } + } else { + if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } + } + } else { + GGML_ASSERT(false); + } +} + +static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); +} + +static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, main_stream); + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } + + (void) dst; +} + +static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + 
(void) src1; +} + +static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); +} + +static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); +} + +static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); +} + +static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); +} + +static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; +} + +void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { + const int64_t nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + + const size_t nb1 = tensor->nb[1]; + + ggml_backend_type backend = tensor->backend; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + + for (int64_t id = 0; id < g_device_count; ++id) { + if (backend == GGML_BACKEND_GPU && id != g_main_device) { + continue; + } + + ggml_cuda_set_device(id); + + int64_t row_low, row_high; + if (backend == GGML_BACKEND_GPU) { + row_low = 0; + row_high = nrows; + } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows; + } else { + row_high = nrows*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + GGML_ASSERT(false); + } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + char * buf_host = (char*)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); + + extra->data_device[id] = buf; + + if (backend == GGML_BACKEND_GPU_SPLIT) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + } + + tensor->extra = extra; +} + +void ggml_cuda_free_data(struct ggml_tensor * tensor) { + if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + return; + } + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + for (int64_t id = 0; id < g_device_count; ++id) { + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if 
(extra->events[id][is] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } + } + + delete extra; +} + +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + +static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { + if (scratch && g_scratch_size == 0) { + return; + } + + tensor->backend = GGML_BACKEND_GPU; + + // recursively assign CUDA buffers until a compute tensor is found + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); + } + } + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); + } + + if (scratch && no_alloc) { + return; + } + + ggml_tensor_extra_gpu * extra; + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW || + force_inplace; + const size_t size = ggml_nbytes(tensor); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&offset, tensor->op_params, sizeof(size_t)); + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_OP_CPY) { + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); + g_scratch_buffer = data; + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(cudaMalloc(&data, size)); + CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + extra->data_device[g_main_device] = data; + } + + tensor->extra = extra; +} + +void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { + if (g_scratch_size == 0) { + return; + } + if 
(g_scratch_buffer == nullptr) { + ggml_cuda_set_device(g_main_device); + CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); + } + + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW; + + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} + +void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_contiguous(tensor)); + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); +} + +void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, true); +} + +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, false, false); +} + +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, true, false); +} + +void ggml_cuda_set_main_device(const int main_device) { + if (main_device >= g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. 
Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + g_main_device = main_device; + if (g_device_count > 1) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + } +} + +void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { + g_mul_mat_q = mul_mat_q; +} + +void ggml_cuda_set_scratch_size(const size_t scratch_size) { + // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously + // it still won't always work as expected, but it's better than nothing + if (scratch_size > g_scratch_size) { + ggml_cuda_free_scratch(); + } + g_scratch_size = std::max(g_scratch_size, scratch_size); +} + +void ggml_cuda_free_scratch() { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(cudaFree(g_scratch_buffer)); + g_scratch_buffer = nullptr; +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_BACKEND_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + return false; + } + + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; + case GGML_OP_DUP: + func = ggml_cuda_dup; + break; + case GGML_OP_ADD: + func = ggml_cuda_add; + break; + case GGML_OP_MUL: + func = ggml_cuda_mul; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_cuda_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_cuda_silu; + break; + default: + return false; + } break; + case GGML_OP_NORM: + func = ggml_cuda_norm; + break; + case GGML_OP_RMS_NORM: + func = ggml_cuda_rms_norm; + break; + case GGML_OP_MUL_MAT: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat; + break; + case GGML_OP_SCALE: + func = ggml_cuda_scale; + break; + case GGML_OP_CPY: + func = ggml_cuda_cpy; + break; + case GGML_OP_CONT: + func = ggml_cuda_dup; + break; + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_cuda_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_cuda_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_cuda_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_cuda_rope; + break; + case GGML_OP_ALIBI: + func = ggml_cuda_alibi; + break; + default: + return false; + } + + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return true; + } + func(tensor->src[0], tensor->src[1], tensor); + return true; +} + +int ggml_cuda_get_device_count() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + return device_count; +} + +void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED 
GGML_UNUSED + +struct ggml_backend_context_cuda { +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + delete cuda_ctx; + delete backend; +} + +struct ggml_backend_buffer_context_cuda { + void * device; + + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ~ggml_backend_buffer_context_cuda() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + CUDA_CHECK(cudaFree(ctx->device)); + delete ctx; +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->device; +} + +static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + } + + return size; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->backend == buffer->backend); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[g_main_device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0])); + } + } + + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, + /* .init_tensor = */ 
ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith  = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
diff --git a/stable-diffusion.cpp/ggml/src/ggml-cuda.h b/stable-diffusion.cpp/ggml/src/ggml-cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..57adc9cf34bc5bc4fae4d576b4c2b1572364b8ff
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/src/ggml-cuda.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES 16
+
+GGML_API void   ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);
+
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+
+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
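+
+// A minimal usage sketch of the legacy (non-backend) API above, kept entirely in
+// comments. Illustrative only: the tensor `w` and its host data `w_data` are
+// hypothetical names, and `w->backend` is assumed to have been set to
+// GGML_BACKEND_GPU (or GGML_BACKEND_GPU_SPLIT) before the upload:
+//
+//     ggml_init_cublas();                    // enumerate devices, create streams
+//     ggml_cuda_set_main_device(0);
+//     ggml_cuda_transform_tensor(w_data, w); // copy w_data to the device(s)
+//     ...                                    // run graphs via ggml_cuda_compute_forward()
+//     ggml_cuda_free_data(w);                // release the device copy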
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/stable-diffusion.cpp/ggml/src/ggml-metal.h b/stable-diffusion.cpp/ggml/src/ggml-metal.h
new file mode 100644
index 0000000000000000000000000000000000000000..096b844e32c6fef91e18033c4638acecc2f322c8
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/src/ggml-metal.h
@@ -0,0 +1,106 @@
+// An interface for computing a ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How it works:
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
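+// A minimal usage sketch, kept in comments (illustrative only; `buf`, `buf_size`,
+// `max_tensor_size`, `gf`, `input` and `output` are hypothetical caller-side names):
+//
+//     struct ggml_metal_context * ctx = ggml_metal_init(1);
+//     ggml_metal_add_buffer(ctx, "data", buf, buf_size, max_tensor_size);
+//     ggml_metal_set_tensor(ctx, input);   // host -> device
+//     ggml_metal_graph_compute(ctx, gf);   // evaluate the graph on the GPU
+//     ggml_metal_get_tensor(ctx, output);  // device -> host
+//     ggml_metal_free(ctx);
+//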
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// internal API
+// temporary exposed to user-code
+//
+
+struct ggml_metal_context;
+
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// try to find operations that can be run concurrently in the graph
+// you should run it again if the topology of your graph changes
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/stable-diffusion.cpp/ggml/src/ggml-metal.m b/stable-diffusion.cpp/ggml/src/ggml-metal.m
new file mode 100644
index 0000000000000000000000000000000000000000..29cb3c922daeba4e912f1b2defcca6fbc30e18f4
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/src/ggml-metal.m
@@ -0,0 +1,1595 @@
+#import "ggml-metal.h"
+
+#import "ggml.h"
+
+#import <Foundation/Foundation.h>
+
+#import <Metal/Metal.h>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#ifdef GGML_METAL_NDEBUG
+#define GGML_METAL_LOG_INFO(...)
+#define GGML_METAL_LOG_WARN(...)
+#define GGML_METAL_LOG_ERROR(...)
+#else
+#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#endif
+
+#define UNUSED(x) (void)(x)
+
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
+struct ggml_metal_buffer {
+    const char * name;
+
+    void   * data;
+    size_t   size;
+
+    id<MTLBuffer> metal;
+};
+
+struct ggml_metal_context {
+    int n_cb;
+
+    id<MTLDevice>       device;
+    id<MTLCommandQueue> queue;
+    id<MTLLibrary>      library;
+
+    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
+    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
+
+    dispatch_queue_t d_queue;
+
+    int n_buffers;
+    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    int concur_list[GGML_MAX_CONCUR];
+    int concur_list_len;
+
+    // custom kernels
+#define GGML_METAL_DECL_KERNEL(name) \
+    id<MTLFunction>             function_##name; \
+    id<MTLComputePipelineState> pipeline_##name
+
+    GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
+    GGML_METAL_DECL_KERNEL(mul);
+    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
+    GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(silu);
+    GGML_METAL_DECL_KERNEL(relu);
+    GGML_METAL_DECL_KERNEL(gelu);
+    GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(soft_max_4);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DECL_KERNEL(get_rows_f32);
+    GGML_METAL_DECL_KERNEL(get_rows_f16);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DECL_KERNEL(get_rows_q8_0);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
+    GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(rope_f32);
+    GGML_METAL_DECL_KERNEL(rope_f16);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
+    GGML_METAL_DECL_KERNEL(concat);
+    GGML_METAL_DECL_KERNEL(sqr);
+
+#undef GGML_METAL_DECL_KERNEL
+};
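+
+// For reference, each GGML_METAL_DECL_KERNEL(name) line in the struct above
+// expands to a pair of members; e.g. GGML_METAL_DECL_KERNEL(add) becomes:
+//
+//     id<MTLFunction>             function_add;
+//     id<MTLComputePipelineState> pipeline_add;
+//
+// GGML_METAL_ADD_KERNEL and GGML_METAL_DEL_KERNEL below fill in and release
+// these members with the same one-macro-per-kernel pattern.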
+
+// MSL code
+// TODO: move the contents here when ready
+//       for now it is easier to work in a separate file
+static NSString * const msl_library_source = @"see metal.metal";
+
+// Here to assist with NSBundle Path Hack
+@interface GGMLMetalClass : NSObject
+@end
+@implementation GGMLMetalClass
+@end
+
+ggml_log_callback ggml_metal_log_callback = NULL;
+void * ggml_metal_log_user_data = NULL;
+
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_metal_log_callback  = log_callback;
+    ggml_metal_log_user_data = user_data;
+}
+
+static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
+    if (ggml_metal_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
+        } else {
+            char * buffer2 = malloc(len+1);
+            vsnprintf(buffer2, len+1, format, args);
+            buffer2[len] = 0;
+            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
+            free(buffer2);
+        }
+        va_end(args);
+    }
+}
+
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
+    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
+
+    id<MTLDevice> device;
+    NSString * s;
+
+#if TARGET_OS_OSX
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    for (device in devices) {
+        s = [device name];
+        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
+#endif
+
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
+    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
+    ctx->queue  = [ctx->device newCommandQueue];
+    ctx->n_buffers = 0;
+    ctx->concur_list_len = 0;
+
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
+
+    // load library
+    {
+        NSBundle * bundle = nil;
+#ifdef SWIFT_PACKAGE
+        bundle = SWIFTPM_MODULE_BUNDLE;
+#else
+        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+#endif
+        NSError * error = nil;
+        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (libPath != nil) {
+            NSURL * libURL = [NSURL fileURLWithPath:libPath];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            ctx->library = [ctx->device
newLibraryWithURL:libURL error:&error]; + } else { + GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + + NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); + NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + if (error) { + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + + MTLCompileOptions* options = nil; +#ifdef GGML_QKK_64 + options = [MTLCompileOptions new]; + options.preprocessorMacros = @{ @"QK_K" : @(64) }; +#endif + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + } + + if (error) { + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + } + + // load kernels + { + NSError * error = nil; +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ + (int) ctx->pipeline_##name.threadExecutionWidth); \ + if (error) { \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } + + GGML_METAL_ADD_KERNEL(add); + GGML_METAL_ADD_KERNEL(add_row); + GGML_METAL_ADD_KERNEL(mul); + GGML_METAL_ADD_KERNEL(mul_row); + GGML_METAL_ADD_KERNEL(scale); + GGML_METAL_ADD_KERNEL(silu); + GGML_METAL_ADD_KERNEL(relu); + GGML_METAL_ADD_KERNEL(gelu); + GGML_METAL_ADD_KERNEL(soft_max); + GGML_METAL_ADD_KERNEL(soft_max_4); + GGML_METAL_ADD_KERNEL(diag_mask_inf); + GGML_METAL_ADD_KERNEL(diag_mask_inf_8); + GGML_METAL_ADD_KERNEL(get_rows_f32); + GGML_METAL_ADD_KERNEL(get_rows_f16); + GGML_METAL_ADD_KERNEL(get_rows_q4_0); + GGML_METAL_ADD_KERNEL(get_rows_q4_1); + GGML_METAL_ADD_KERNEL(get_rows_q8_0); + GGML_METAL_ADD_KERNEL(get_rows_q2_K); + GGML_METAL_ADD_KERNEL(get_rows_q3_K); + GGML_METAL_ADD_KERNEL(get_rows_q4_K); + GGML_METAL_ADD_KERNEL(get_rows_q5_K); + GGML_METAL_ADD_KERNEL(get_rows_q6_K); + GGML_METAL_ADD_KERNEL(rms_norm); + GGML_METAL_ADD_KERNEL(norm); + GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + } + GGML_METAL_ADD_KERNEL(rope_f32); + GGML_METAL_ADD_KERNEL(rope_f16); + 
GGML_METAL_ADD_KERNEL(alibi_f32); + GGML_METAL_ADD_KERNEL(cpy_f32_f16); + GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f16_f16); + GGML_METAL_ADD_KERNEL(concat); + GGML_METAL_ADD_KERNEL(sqr); + +#undef GGML_METAL_ADD_KERNEL + } + +#if TARGET_OS_OSX + // print MTL GPU family: + GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); + break; + } + } + + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + if (ctx->device.maxTransferRate != 0) { + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + } else { + GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + } +#endif + + return ctx; +} + +void ggml_metal_free(struct ggml_metal_context * ctx) { + GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); +#define GGML_METAL_DEL_KERNEL(name) \ + [ctx->function_##name release]; \ + [ctx->pipeline_##name release]; + + GGML_METAL_DEL_KERNEL(add); + GGML_METAL_DEL_KERNEL(add_row); + GGML_METAL_DEL_KERNEL(mul); + GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(silu); + GGML_METAL_DEL_KERNEL(relu); + GGML_METAL_DEL_KERNEL(gelu); + GGML_METAL_DEL_KERNEL(soft_max); + GGML_METAL_DEL_KERNEL(soft_max_4); + GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(diag_mask_inf_8); + GGML_METAL_DEL_KERNEL(get_rows_f32); + GGML_METAL_DEL_KERNEL(get_rows_f16); + GGML_METAL_DEL_KERNEL(get_rows_q4_0); + GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q8_0); + GGML_METAL_DEL_KERNEL(get_rows_q2_K); + GGML_METAL_DEL_KERNEL(get_rows_q3_K); + GGML_METAL_DEL_KERNEL(get_rows_q4_K); + GGML_METAL_DEL_KERNEL(get_rows_q5_K); + GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(rms_norm); + GGML_METAL_DEL_KERNEL(norm); + GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + } + GGML_METAL_DEL_KERNEL(rope_f32); + GGML_METAL_DEL_KERNEL(rope_f16); + GGML_METAL_DEL_KERNEL(alibi_f32); + 
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+    GGML_METAL_DEL_KERNEL(concat);
+    GGML_METAL_DEL_KERNEL(sqr);
+
+#undef GGML_METAL_DEL_KERNEL
+
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        [ctx->buffers[i].metal release];
+    }
+
+    [ctx->library release];
+    [ctx->queue release];
+    [ctx->device release];
+
+    dispatch_release(ctx->d_queue);
+
+    free(ctx);
+}
+
+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
+}
+
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
+}
+
+// finds the Metal buffer that contains the tensor data on the GPU device
+// the assumption is that there is a 1-to-1 mapping between the host and device memory buffers, so we can find the
+// Metal buffer based on the host memory pointer
+//
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+
+        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
+
+            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+
+            return ctx->buffers[i].metal;
+        }
+    }
+
+    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
+
+    return nil;
+}
+
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size) {
+    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
+        return false;
+    }
+
+    if (data) {
+        // verify that the buffer does not overlap with any of the existing buffers
+        for (int i = 0; i < ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
+
+            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+                GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
+                return false;
+            }
+        }
+
+        const size_t size_page = sysconf(_SC_PAGESIZE);
+
+        size_t size_aligned = size;
+        if ((size_aligned % size_page) != 0) {
+            size_aligned += (size_page - (size_aligned % size_page));
+        }
+
+        // the buffer fits into the max buffer size allowed by the device
+        if (size_aligned <= ctx->device.maxBufferLength) {
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = data;
+            ctx->buffers[ctx->n_buffers].size = size;
+
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+            ++ctx->n_buffers;
+        } else {
+            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+            // one of the views
+            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+            const size_t size_view = ctx->device.maxBufferLength;
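+            // Illustrative example (hypothetical numbers, not taken from the code):
+            // with maxBufferLength = 16 GiB and max_size = 2 GiB, size_ovlp is
+            // max_size rounded up by roughly two pages (~2 GiB), so views of
+            // size_view = 16 GiB start every size_step ~= 14 GiB and consecutive
+            // views overlap by size_ovlp bytes. Any tensor of at most max_size
+            // bytes therefore lies entirely inside at least one view.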
+
+            for (size_t i = 0; i < size; i += size_step) {
+                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+                ctx->buffers[ctx->n_buffers].name = name;
+                ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+                ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+
+                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                if (i + size_step < size) {
+                    GGML_METAL_LOG_INFO("\n");
+                }
+
+                ++ctx->n_buffers;
+            }
+        }
+
+#if TARGET_OS_OSX
+        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+            GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n");
+        } else {
+            GGML_METAL_LOG_INFO("\n");
+        }
+#else
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+    }
+
+    return true;
+}
+
+void ggml_metal_set_tensor(
+        struct ggml_metal_context * ctx,
+        struct ggml_tensor * t) {
+    size_t offs;
+    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
+
+    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
+}
+
+void ggml_metal_get_tensor(
+        struct ggml_metal_context * ctx,
+        struct ggml_tensor * t) {
+    size_t offs;
+    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
+
+    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
+}
+
+void ggml_metal_graph_find_concurrency(
+        struct ggml_metal_context * ctx,
+        struct ggml_cgraph * gf, bool check_mem) {
+    int search_depth = gf->n_nodes; // we only find concurrency in this range to avoid wasting too much time
+    int nodes_unused[GGML_MAX_CONCUR];
+
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
+    ctx->concur_list_len = 0;
+
+    int n_left    = gf->n_nodes;
+    int n_start   = 0; // all nodes before n_start in the nodes_unused array have been sorted and stored back to ctx->concur_list
+    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
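+
+    // Greedy layering, as implemented below: repeatedly collect every not-yet-scheduled
+    // node whose sources are either leafs or already present in concur_list and, when
+    // check_mem is set, whose memory region does not alias the data of an earlier
+    // unscheduled node; emit the collected nodes as one concurrent layer and append -1
+    // to concur_list as a barrier marker between layers.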
+    while (n_left > 0) {
+        // number of nodes at a layer (that can be issued concurrently)
+        int concurrency = 0;
+        for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
+            if (nodes_unused[i]) {
+                // if the requirements for gf->nodes[i] are satisfied
+                int exe_flag = 1;
+
+                // scan all srcs
+                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
+                    struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
+                    if (src_cur) {
+                        // if it is a leaf node, it is satisfied.
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }
+
+                        // otherwise this src should be the output from previous nodes.
+                        int is_found = 0;
+
+                        // scan 2*search_depth back because we inserted barriers.
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
+                        }
+                    }
+                }
+                if (exe_flag && check_mem) {
+                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
+                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
+                    int64_t data_start = (int64_t) gf->nodes[i]->data;
+                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
+                    for (int j = n_start; j < i; j++) {
+                        if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
+                                            && gf->nodes[j]->op != GGML_OP_VIEW \
+                                            && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
+                                            && gf->nodes[j]->op != GGML_OP_PERMUTE) {
+                            if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
+                                ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
+                                continue;
+                            }
+
+                            exe_flag = 0;
+                        }
+                    }
+                }
+                if (exe_flag) {
+                    ctx->concur_list[level_pos + concurrency] = i;
+                    nodes_unused[i] = 0;
+                    concurrency++;
+                    ctx->concur_list_len++;
+                }
+            }
+        }
+        n_left -= concurrency;
+        // add a barrier between layers
+        ctx->concur_list[level_pos + concurrency] = -1;
+        ctx->concur_list_len++;
+        // skip the nodes that have already been sorted into ctx->concur_list
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
+        level_pos += concurrency + 1;
+    }
+
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
+        GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
+    }
+}
+
+void ggml_metal_graph_compute(
+        struct ggml_metal_context * ctx,
+               struct ggml_cgraph * gf) {
+    @autoreleasepool {
+
+    // if there is a ctx->concur_list, dispatch concurrently
+    // else fall back to serial dispatch
+    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
+
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
+
+    const int n_nodes  = has_concur ? ctx->concur_list_len : gf->n_nodes;
+    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = ctx->n_cb;
+
+    for (int i = 0; i < n_cb; ++i) {
+        ctx->command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [ctx->command_buffers[i] enqueue];
+
+        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+    }
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(ctx->d_queue, ^{
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst  = 0;
+
+            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
+
+            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
+
+            for (int ind = node_start; ind < node_end; ++ind) {
+                const int i = has_concur ? ctx->concur_list[ind] : ind;
+
+                if (i == -1) {
+                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+                    continue;
+                }
+
+                //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+                struct ggml_tensor * dst  = gf->nodes[i];
+
+                const int64_t ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t ne03 = src0 ? src0->ne[3] : 0;
+
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+                const int64_t ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+                const int64_t ne0 = dst ? dst->ne[0] : 0;
+                const int64_t ne1 = dst ? dst->ne[1] : 0;
+                const int64_t ne2 = dst ? dst->ne[2] : 0;
+                const int64_t ne3 = dst ? dst->ne[3] : 0;
+
+                const uint64_t nb0 = dst ? dst->nb[0] : 0;
+                const uint64_t nb1 = dst ? dst->nb[1] : 0;
+                const uint64_t nb2 = dst ? dst->nb[2] : 0;
+                const uint64_t nb3 = dst ? dst->nb[3] : 0;
+
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+
+                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+                id<MTLBuffer> id_dst  = dst  ?
ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_CONCAT: + { + + int64_t nb = ne00; + [encoder setComputePipelineState:ctx->pipeline_concat]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ADD: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + bool bcast_row = false; + + int64_t nb = ne00; + + if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { + // src1 is a row + GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; + [encoder setComputePipelineState:ctx->pipeline_add_row]; + + bcast_row = true; + } else { + [encoder setComputePipelineState:ctx->pipeline_add]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + 
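+                            // note: buffer indices 3..26 follow the declaration order of the
+                            // corresponding constants in kernel_add (ggml-metal.metal); index 27
+                            // carries nb, which only the kernel_add_row fast path reads via
+                            // its [[buffer(27)]] attribute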
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + } break; + case GGML_OP_MUL: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + + if (ggml_nelements(src1) == ne10) { + // src1 is a row + GGML_ASSERT(ne11 == 1); + [encoder setComputePipelineState:ctx->pipeline_mul_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + const float scale = *(const float *) src1->data; + + [encoder setComputePipelineState:ctx->pipeline_scale]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_SILU: + { + [encoder setComputePipelineState:ctx->pipeline_silu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_RELU: + { + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder 
dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU: + { + [encoder setComputePipelineState:ctx->pipeline_gelu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + default: + { + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } break; + case GGML_OP_SQR: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + [encoder setComputePipelineState:ctx->pipeline_sqr]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + const int nth = MIN(32, ne00); + + if (ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((int32_t *)(dst->op_params))[0]; + + if (ne00%8 == 0) { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8]; + } else { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + } break; + case GGML_OP_MUL_MAT: + { + GGML_ASSERT(ne00 == ne10); + GGML_ASSERT(ne03 == ne13); + + const uint gqa = ne12/ne02; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; + +#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach + if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { + switch (src0t) { + case GGML_TYPE_F16: ne11_mm_min = 2; break; + case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; + case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; + case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; + case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; + case GGML_TYPE_Q5_0: // not tested yet + case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet + case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; + default: ne11_mm_min = 1; break; + } + } +#endif + + // 
for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && + ne11 > ne11_mm_min) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; + default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; + nrows = 4; + } break; + case GGML_TYPE_F16: + { + nth0 = 32; + nth1 = 1; + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + nrows = ne11; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + nrows = 4; + } + } break; + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder 
setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; + } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; + } break; + case GGML_TYPE_Q2_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; + } break; + case GGML_TYPE_Q3_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; + } break; + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 4; //1; + nth1 = 8; //32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; + } break; + case GGML_TYPE_Q5_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; + } break; + case GGML_TYPE_Q6_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; + } break; + default: + { + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_ASSERT(false && "not implemented"); + } + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; + + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || + src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { +#ifdef GGML_QKK_64 + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#else + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#endif + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + int64_t ny = (ne11 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) 
threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5]; + + const int64_t n = ggml_nelements(src1); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = MIN(512, ne00); + + [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_NORM: + { + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = MIN(256, ne00); + + [encoder setComputePipelineState:ctx->pipeline_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT((src0t == GGML_TYPE_F32)); + + const int nth = MIN(1024, ne00); + + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / 
n_heads_log2_floor); + + [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + [encoder setBytes:&m1 length:sizeof( float) atIndex:19]; + [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + default: GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&freq_base 
length:sizeof(float) atIndex:22]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + const int nth = MIN(1024, ne00); + + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; + case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + { + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } + } + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + [command_buffer commit]; + }); + } + + // wait for all threads to finish + dispatch_barrier_sync(ctx->d_queue, ^{}); + + // check status of command buffers + // needed to detect if the device ran out-of-memory for example (#1881) + for (int i = 0; i < n_cb; i++) { + [ctx->command_buffers[i] waitUntilCompleted]; + + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_ASSERT(false); + } + } + + } +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_metal_name(ggml_backend_t backend) { + return "Metal"; + + UNUSED(backend); +} + +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx); + free(backend); +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void 
*)buffer->context; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + +static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + void * data = ggml_metal_host_malloc(size); + + // TODO: set proper name of the buffers + ggml_metal_add_buffer(ctx, "backend", data, size, 0); + + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); +} + +static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { + return 32; + UNUSED(backend); +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_graph_compute(metal_ctx, cgraph); +} + +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i metal_backend_i = { + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_get_alignment, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ 
ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+    *metal_backend = (struct ggml_backend) {
+        /* .interface = */ metal_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return metal_backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_metal_name;
+}
+
+void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_set_n_cb(ctx, n_cb);
+}
diff --git a/stable-diffusion.cpp/ggml/src/ggml-metal.metal b/stable-diffusion.cpp/ggml/src/ggml-metal.metal
new file mode 100644
index 0000000000000000000000000000000000000000..b6288db28660dc9e8902cb7a27369e9af30db695
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/src/ggml-metal.metal
@@ -0,0 +1,2520 @@
+#include <metal_stdlib>
+
+using namespace metal;
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+
+#define QK4_0 32
+#define QR4_0 2
+typedef struct {
+    half    d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+
+#define QK4_1 32
+typedef struct {
+    half    d;              // delta
+    half    m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+
+#define QK8_0 32
+typedef struct {
+    half   d;               // delta
+    int8_t qs[QK8_0];       // quants
+} block_q8_0;
+
+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
+kernel void kernel_add(
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant  int64_t & nb00,
+        constant  int64_t & nb01,
+        constant  int64_t & nb02,
+        constant  int64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant  int64_t & nb10,
+        constant  int64_t & nb11,
+        constant  int64_t & nb12,
+        constant  int64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant  int64_t & nb0,
+        constant  int64_t & nb1,
+        constant  int64_t & nb2,
+        constant  int64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0];
+
+        src0_ptr += ntg.x*nb00;
+        src1_ptr += ntg.x*nb10;
+        dst_ptr  += ntg.x*nb0;
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 across src0
+kernel void kernel_add_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb [[buffer(27)]],
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
+}
+
+kernel void
kernel_mul(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src1[tpig];
+}
+
+// assumption: src1 is a row
+// broadcast src1 across src0
+kernel void kernel_mul_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
+}
+
+kernel void kernel_scale(
+        device const float4 * src0,
+        device       float4 * dst,
+        constant      float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_silu(
+        device const float4 * src0,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_relu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = max(0.0f, src0[tpig]);
+}
+
+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
+constant float GELU_COEF_A    = 0.044715f;
+constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+
+kernel void kernel_gelu(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
+    // This was observed with the Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_soft_max(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    // parallel max
+    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
+        lmax = MAX(lmax, psrc0[i00]);
+    }
+    const float max = simd_max(lmax);
+
+    // parallel sum
+    float lsum = 0.0f;
+    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        lsum += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
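+        // (simd_max above and simd_sum below only reduce across one SIMD group;
+        // this is sufficient because the host dispatches MIN(32, ne00) threads per
+        // threadgroup for GGML_OP_SOFT_MAX, i.e. a single SIMD group.)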
+ pdst[i00] = exp_psrc0; + } + + const float sum = simd_sum(lsum); + + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + pdst[i00] /= sum; + } +} + +kernel void kernel_soft_max_4( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + + // parallel max + float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; + for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + lmax4 = fmax(lmax4, psrc4[i00]); + } + float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + + const float max = simd_max(lmax); + + // parallel sum + float4 lsum4 = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + const float4 exp_psrc4 = exp(psrc4[i00] - max); + lsum4 += exp_psrc4; + pdst4[i00] = exp_psrc4; + } + float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + + const float sum = simd_sum(lsum); + + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + pdst4[i00] /= sum; + } +} + +kernel void kernel_diag_mask_inf( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { + const int64_t i02 = tpig[2]; + const int64_t i01 = tpig[1]; + const int64_t i00 = tpig[0]; + + if (i00 > n_past + i01) { + dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; + } else { + dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; + } +} + +kernel void kernel_diag_mask_inf_8( + device const float4 * src0, + device float4 * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { + + const int64_t i = 2*tpig[0]; + + dst[i+0] = src0[i+0]; + dst[i+1] = src0[i+1]; + int64_t i4 = 4*i; + const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01; + const int64_t i01 = i4/(ne00); i4 -= i01*ne00; + const int64_t i00 = i4; + for (int k = 3; k >= 0; --k) { + if (i00 + 4 + k <= n_past + i01) { + break; + } + dst[i+1][k] = -INFINITY; + if (i00 + k > n_past + i01) { + dst[i][k] = -INFINITY; + } + } +} + +kernel void kernel_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + // MEAN + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += x[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + const float mean = sum[0] / ne00; + + // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); + device float * y = dst + tgpig*ne00; + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) 
{ + y[i00] = x[i00] - mean; + sum[tpitg] += y[i00] * y[i00]; + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + const float variance = sum[0] / ne00; + + const float scale = 1.0f/sqrt(variance + eps); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = y[i00] * scale; + } +} + +kernel void kernel_rms_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); + device const float * x_scalar = (device const float *) x; + float4 sumf=0; + float all_sum=0; + + // parallel sum + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + sumf += x[i00] * x[i00]; + } + all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3]; + all_sum = simd_sum(all_sum); + if (tiisg == 0) { + sum[sgitg] = all_sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + } + if (tpitg == 0) { + for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];} + sum[0] /= ne00; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float mean = sum[0]; + const float scale = 1.0f/sqrt(mean + eps); + + device float4 * y = (device float4 *) (dst + tgpig*ne00); + device float * y_scalar = (device float *) y; + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + y[i00] = x[i00] * scale; + } + if (tpitg == 0) { + for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;} + } +} + +// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float2 acc = 0.f; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2); + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + + yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0) + + yl[i + 9] * (qs[i / 2] & 0xF000); + } + return d * (sumy * -8.f + acc[0] + acc[1]); +} + +// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float m = qb_curr->m; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + float2 acc = 0.f; + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + + yl[i + 1] * (qs[i / 2] & 
0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+    return d * (acc[0] + acc[1]) + sumy * m;
+}
+
+// putting these in the kernel causes a significant performance penalty
+#define N_DST 4        // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
+// Note: This is a template, but strictly speaking it only applies to
+//       quantizations where the block size is 32. It also does not
+//       guard against the number of rows not being divisible by
+//       N_DST, so this is another explicit assumption of the implementation.
+template<typename block_q_type, int nr, int nsg, int nw>
+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
+                     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
+                     uint3 tgpig, uint tiisg, uint sgitg) {
+    const int nb = ne00/QK4_0;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr;
+
+    const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
+
+    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
+    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16];       // src1 vector cache
+    float sumf[nr] = {0.f};
+
+    const int ix = (tiisg/2);
+    const int il = (tiisg%2)*8;
+
+    device const float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
+        float sumy = 0;
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f;
+
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f;
+            yl[i+9] = yb[i+17]/4096.f;
+        }
+
+        for (int row = 0; row < nr; row++) {
+            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
+        }
+
+        yb += QK4_0 * 16;
+    }
+
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0 && first_row + row < ne01) {
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
+        }
+    }
+}
+
+kernel void kernel_mul_mv_q4_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01[[buffer(4)]],
+        constant   int64_t & ne02[[buffer(5)]],
+        constant   int64_t & ne10[[buffer(9)]],
+        constant   int64_t & ne12[[buffer(11)]],
+        constant   int64_t & ne0[[buffer(15)]],
+        constant   int64_t & ne1[[buffer(16)]],
+        constant      uint & gqa[[buffer(17)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
+}
+
+kernel void kernel_mul_mv_q4_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01[[buffer(4)]],
+        constant   int64_t & ne02[[buffer(5)]],
+        constant   int64_t & ne10[[buffer(9)]],
+        constant   int64_t & ne12[[buffer(11)]],
+        constant   int64_t & ne0[[buffer(15)]],
+        constant   int64_t & ne1[[buffer(16)]],
+        constant      uint & gqa[[buffer(17)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
+}
+
+#define NB_Q8_0 8
+
+kernel void kernel_mul_mv_q8_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device
float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nr = N_DST; + const int nsg = N_SIMDGROUP; + const int nw = N_SIMDWIDTH; + + const int nb = ne00/QK8_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[NB_Q8_0]; + float sumf[nr]={0.f}; + + const int ix = tiisg/4; + const int il = tiisg%4; + + device const float * yb = y + ix * QK8_0 + NB_Q8_0*il; + + // each thread in a SIMD group deals with NB_Q8_0 quants at a time + for (int ib = ix; ib < nb; ib += nw/4) { + for (int i = 0; i < NB_Q8_0; ++i) { + yl[i] = yb[i]; + } + + for (int row = 0; row < nr; row++) { + device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il; + float sumq = 0.f; + for (int iq = 0; iq < NB_Q8_0; ++iq) { + sumq += qs[iq] * yl[iq]; + } + sumf[row] += sumq*x[ib+row*nb].d; + } + + yb += NB_Q8_0 * nw; + } + + for (int row = 0; row < nr; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + } + } +} + +#define N_F32_F32 4 + +kernel void kernel_mul_mv_f32_f32( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F32_F32; + const int64_t im = tgpig.z; + + device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F32_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const float4 * x4 = (device const float4 *)x; + for (int row = 0; row < N_F32_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float4 * y4 = (device const float4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + 
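+// The mul_mv kernels above and below all share the same reduction pattern:
+// each of the 32 lanes of a SIMD group accumulates a strided slice of the dot
+// product, simd_sum() folds the partial sums across lanes, and lane 0 writes
+// the result. A minimal sketch of the scalar path:
+//
+//     float sumf = 0;
+//     for (int i = tiisg; i < ne00; i += 32) { // lane-strided partial dot product
+//         sumf += (float) x[i] * (float) y[i];
+//     }
+//     float all_sum = simd_sum(sumf);          // cross-lane reduction
+//     if (tiisg == 0) {
+//         dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+//     }
+//
+// The float4/half4 branches apply the same idea with vectorized loads once a
+// row has at least 128 elements.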
+kernel void kernel_mul_mv_f16_f32_1row( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + if (ne00 < 128) { + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } else { + device const half4 * x4 = (device const half4 *) x; + device const float4 * y4 = (device const float4 *) y; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + +} + +#define N_F16_F32 4 + +kernel void kernel_mul_mv_f16_f32( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F16_F32; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float4 * y4 = (device const float4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + +// Assumes row size (ne00) is a multiple of 4 +kernel void kernel_mul_mv_f16_f32_l4( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant 
int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int nrows = ne11; + const int64_t r0 = tgpig.x; + const int64_t im = tgpig.z; + + device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + for (int r1 = 0; r1 < nrows; ++r1) { + device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } +} + +kernel void kernel_alibi_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant float & m0, + constant float & m1, + constant int & n_heads_log2_floor, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float m_k; + if (i2 < n_heads_log2_floor) { + m_k = pow(m0, i2 + 1); + } else { + m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1); + } + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + } +} + +typedef void (rope_t)( + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]); + +template +kernel void kernel_rope( + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant 
int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; + + const bool is_neox = mode & 2; + + device const int32_t * pos = src1; + + const int64_t p = pos[i2]; + + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; + + if (!is_neox) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const T x0 = src[0]; + const T x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + const int64_t i0 = ib*n_dims + ic/2; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } +} + +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + +kernel void kernel_cpy_f16_f16( + device const half * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + 
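// copy one row: i00 walks the innermost dimension with the threadgroup stride; the + // source element is fetched through its byte strides (nb00..nb03), so non-contiguous + // layouts are handled, while dst_data is written element-contiguously + 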
device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + +kernel void kernel_cpy_f32_f16( + device const float * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} + +kernel void kernel_cpy_f32_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} + +kernel void kernel_concat( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t 
& nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + if (i02 < ne02) { + ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0]; + src0_ptr += ntg.x*nb00; + } else { + ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0]; + src1_ptr += ntg.x*nb10; + } + dst_ptr += ntg.x*nb0; + } +} + +//============================================ k-quants ====================================================== + +#ifndef QK_K +#define QK_K 256 +#else +static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins +} block_q2_K; +// 84 bytes / block + +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; + +#if QK_K == 64 +typedef struct { + half d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#endif + +#if QK_K == 64 +typedef struct { + half d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +// 176 bytes / block +#endif + +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + half d; // super-block scale +} block_q6_K; +// 210 bytes / block + +static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { + uchar4 r; + if (j < 4) { + r[0] = q[j+0] & 63; + r[2] = q[j+1] & 63; + r[1] = q[j+4] & 63; + r[3] = q[j+5] & 63; + } else { + r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); + r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); + } + return r; +} + +//====================================== dot products ========================= + +kernel void kernel_mul_mv_q2_K_f32( + device 
const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q2_K) * nb; + +#if QK_K == 256 + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 + const int is = (8*ir)/16;// 0 or 1 + + device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; + + for (int ib = ix; ib < nb; ib += 4) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+64]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; + } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + float dall = dh[0]; + float dmin = dh[1] * 1.f/16.f; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 4 * QK_K; + } +#else + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0...1 + + device const float * y4 = y + ix * QK_K + 8 * it; + + for (int ib = ix; ib < nb; ib += 16) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+32]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+48]; sumy[3] += yl[i+24]; + } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for 
(int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 16 * QK_K; + } +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} + +#if QK_K == 256 +kernel void kernel_mul_mv_q3_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float yl[32]; + + //const uint16_t kmask1 = 0x3030; + //const uint16_t kmask2 = 0x0f0f; + + const int tid = tiisg/4; + const int ix = tiisg%4; + const int ip = tid/4; // 0 or 1 + const int il = 2*((tid%4)/2); // 0 or 2 + const int ir = tid%2; + const int n = 8; + const int l0 = n*ir; + + // One would think that the Metal compiler would figure out that ip and il can only have + // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it + // with these two tables. + // + // Possible masks for the high bit + const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0 + {0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2 + {0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0 + {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2 + + // Possible masks for the low 2 bits + const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}}; + + const ushort4 hm = mm[2*ip + il/2]; + + const int shift = 2*il; + const float v1 = il == 0 ? 
4.f : 64.f; + const float v2 = 4.f * v1; + + const uint16_t s_shift1 = 4*ip; + const uint16_t s_shift2 = s_shift1 + il; + + const int q_offset = 32*ip + l0; + const int y_offset = 128*ip + 32*il + l0; + + const int step = sizeof(block_q3_K) * nb / 2; + + device const float * y1 = yy + ix*QK_K + y_offset; + + uint32_t scales32, aux32; + thread uint16_t * scales16 = (thread uint16_t *)&scales32; + thread const int8_t * scales = (thread const int8_t *)&scales32; + + float sumf1[2] = {0.f}; + float sumf2[2] = {0.f}; + for (int i = ix; i < nb; i += 4) { + + for (int l = 0; l < 8; ++l) { + yl[l+ 0] = y1[l+ 0]; + yl[l+ 8] = y1[l+16]; + yl[l+16] = y1[l+32]; + yl[l+24] = y1[l+48]; + } + + device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); + device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0); + device const uint16_t * a = (device const uint16_t *)(x[i].scales); + device const half * dh = &x[i].d; + + for (int row = 0; row < 2; ++row) { + + const float d_all = (float)dh[0]; + + scales16[0] = a[4]; + scales16[1] = a[5]; + aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030; + scales16[0] = a[il+0]; + scales16[1] = a[il+1]; + scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32; + + float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0; + for (int l = 0; l < n; l += 2) { + const int32_t qs = q[l/2]; + s1 += yl[l+0] * (qs & qm[il/2][0]); + s2 += yl[l+1] * (qs & qm[il/2][1]); + s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]); + s4 += yl[l+16] * (qs & qm[il/2][2]); + s5 += yl[l+17] * (qs & qm[il/2][3]); + s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]); + } + float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[0] - 32); + sumf2[row] += d2 * (scales[2] - 32); + + s1 = s2 = s3 = s4 = s5 = s6 = 0; + for (int l = 0; l < n; l += 2) { + const int32_t qs = q[l/2+8]; + s1 += yl[l+8] * (qs & qm[il/2][0]); + s2 += yl[l+9] * (qs & qm[il/2][1]); + s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]); + s4 += yl[l+24] * (qs & qm[il/2][2]); + s5 += yl[l+25] * (qs & qm[il/2][3]); + s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 
0.f : yl[l+25]); + } + d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[1] - 32); + sumf2[row] += d2 * (scales[3] - 32); + + q += step; + h += step; + a += step; + dh += step; + + } + + y1 += 4 * QK_K; + + } + + for (int row = 0; row < 2; ++row) { + const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift); + sumf1[row] = simd_sum(sumf); + } + if (tiisg == 0) { + for (int row = 0; row < 2; ++row) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; + } + } +} +#else +kernel void kernel_mul_mv_q3_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; + + const int row = 2 * r0 + sgitg; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + const int ix = tiisg/4; + const int il = 4 * (tiisg%4);// 0, 4, 8, 12 + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + float2 sum = {0.f, 0.f}; + + for (int i = ix; i < nb; i += 8) { + + const float d_all = (float)(x[i].d); + + device const uint16_t * q = (device const uint16_t *)(x[i].qs + il); + device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in); + device const uint16_t * s = (device const uint16_t *)(x[i].scales); + device const float * y = yy + i * QK_K + il; + + const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8); + const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f; + const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f; + const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; + + for (int l = 0; l < 4; l += 2) { + const uint16_t hm = h[l/2] >> im; + sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) + + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) + + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64)) + + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256)); + sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024)) + + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096)) + + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384)) + + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 
0 : 65536)); + } + + } + const float sumf = sum[0] + sum[1] * 1.f/256.f; + + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } + +} +#endif + +#if QK_K == 256 +kernel void kernel_mul_mv_q4_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01 [[buffer(4)]], + constant int64_t & ne02 [[buffer(5)]], + constant int64_t & ne10 [[buffer(9)]], + constant int64_t & ne12 [[buffer(11)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & gqa [[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int first_row = r0 * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[16]; + float yh[16]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q4_K) * nb / 2; + + device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + for (int ib = ix; ib < nb; ib += 4) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = y4[i+128]; sumy[2] += yh[i+0]; + yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & kmask1; + sc16[1] = sc[2] & kmask1; + sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); + sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); + + device const uint16_t * q2 = q1 + 32; + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (q1[i/2] & 0x000F); + acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00); + acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0); + acc1[3] += yl[i+9] * (q1[i/2] & 0xF000); + acc2[0] += yh[i+0] * (q2[i/2] & 0x000F); + acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00); + acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0); + acc2[3] += yh[i+9] * (q2[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + sc += step; + dh += step; + } + + y4 += 4 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} +#else +kernel 
void kernel_mul_mv_q4_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int ix = tiisg/4; // 0...7 + const int it = tiisg%4; // 0...3 + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[8]; + float yh[8]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q4_K) * nb / 2; + + device const float * y4 = y + ix * QK_K + 8 * it; + + uint16_t sc16[4]; + + for (int ib = ix; ib < nb; ib += 8) { + + float2 sumy = {0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i] = y4[i+ 0]; sumy[0] += yl[i]; + yh[i] = y4[i+32]; sumy[1] += yh[i]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & 0x000f; + sc16[1] = sc[0] & 0x0f00; + sc16[2] = sc[0] & 0x00f0; + sc16[3] = sc[0] & 0xf000; + + float2 acc1 = {0.f, 0.f}; + float2 acc2 = {0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (qs[i/2] & 0x000F); + acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00); + acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0); + acc2[1] += yh[i+1] * (qs[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] + + (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) - + dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f); + + qs += step; + sc += step; + dh += step; + } + + y4 += 8 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} +#endif + +kernel void kernel_mul_mv_q5_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float sumf[2]={0.f}; + + const int step = sizeof(block_q5_K) * nb; + +#if QK_K == 256 +# 
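+ // QK_K == 256 path: the 32 simdgroup threads are tiled 4x8 - ix = tiisg%4 strides + // across super-blocks while tid = tiisg/4 picks a 32-element slice via im/ir; the + // hm1..hm4 masks select the qh bits that supply the 5th bit of each 4-bit quant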
+ float yl[16], yh[16]; + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = tiisg/4; + const int ix = tiisg%4; + const int im = tid/4; + const int ir = tid%4; + const int n = 8; + + const int l0 = n*ir; + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1u << (2*im); + const uint8_t hm2 = hm1 << 1; + const uint8_t hm3 = hm1 << 4; + const uint8_t hm4 = hm2 << 4; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + device const float * y1 = yy + ix*QK_K + y_offset; + + for (int i = ix; i < nb; i += 4) { + + device const uint8_t * q1 = x[i].qs + q_offset; + device const uint8_t * qh = x[i].qh + l0; + device const half * dh = &x[i].d; + device const uint16_t * a = (device const uint16_t *)x[i].scales + im; + + device const float * y2 = y1 + 128; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; + yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; + yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; + yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; + } + + for (int row = 0; row < 2; ++row) { + + device const uint8_t * q2 = q1 + 64; + + sc16[0] = a[0] & kmask1; + sc16[1] = a[2] & kmask1; + sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); + sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); + + float4 acc1 = {0.f}; + float4 acc2 = {0.f}; + for (int l = 0; l < n; ++l) { + uint8_t h = qh[l]; + acc1[0] += yl[l+0] * (q1[l] & 0x0F); + acc1[1] += yl[l+8] * (q1[l] & 0xF0); + acc1[2] += yh[l+0] * (q2[l] & 0x0F); + acc1[3] += yh[l+8] * (q2[l] & 0xF0); + acc2[0] += h & hm1 ? yl[l+0] : 0.f; + acc2[1] += h & hm2 ? yl[l+8] : 0.f; + acc2[2] += h & hm3 ? yh[l+0] : 0.f; + acc2[3] += h & hm4 ? yh[l+8] : 0.f; + } + const float dall = dh[0]; + const float dmin = dh[1]; + sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + + sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + + sc8[4] * (acc1[2] + 16.f*acc2[2]) + + sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + qh += step; + dh += step/2; + a += step/2; + + } + + y1 += 4 * QK_K; + + } +#else + float yl[8], yh[8]; + + const int il = 4 * (tiisg/8); // 0, 4, 8, 12 + const int ix = tiisg%8; + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + device const float * y = yy + ix*QK_K + il; + + for (int i = ix; i < nb; i += 8) { + + for (int l = 0; l < 4; ++l) { + yl[l+0] = y[l+ 0]; + yl[l+4] = y[l+16]; + yh[l+0] = y[l+32]; + yh[l+4] = y[l+48]; + } + + device const half * dh = &x[i].d; + device const uint8_t * q = x[i].qs + il; + device const uint8_t * h = x[i].qh + in; + device const int8_t * s = x[i].scales; + + for (int row = 0; row < 2; ++row) { + + const float d = dh[0]; + + float2 acc = {0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + const uint8_t hl = h[l] >> im; + acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) + + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); + acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) + + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 
0 : 256)); + } + sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]); + + q += step; + h += step; + s += step; + dh += step/2; + + } + + y += 8 * QK_K; + } +#endif + + for (int row = 0; row < 2; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + } + } + +} + +kernel void kernel_mul_mv_q6_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const uint8_t kmask1 = 0x03; + const uint8_t kmask2 = 0x0C; + const uint8_t kmask3 = 0x30; + const uint8_t kmask4 = 0xC0; + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; + + const int row = 2 * r0 + sgitg; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float sumf = 0; + +#if QK_K == 256 + const int tid = tiisg/2; + const int ix = tiisg%2; + const int ip = tid/8; // 0 or 1 + const int il = tid%8; + const int n = 4; + const int l0 = n*il; + const int is = 8*ip + l0/16; + + const int y_offset = 128*ip + l0; + const int q_offset_l = 64*ip + l0; + const int q_offset_h = 32*ip + l0; + + for (int i = ix; i < nb; i += 2) { + + device const uint8_t * q1 = x[i].ql + q_offset_l; + device const uint8_t * q2 = q1 + 32; + device const uint8_t * qh = x[i].qh + q_offset_h; + device const int8_t * sc = x[i].scales + is; + + device const float * y = yy + i * QK_K + y_offset; + + const float dall = x[i].d; + + float4 sums = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32); + sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + + sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); + + } + +#else + const int ix = tiisg/4; + const int il = 4*(tiisg%4); + + for (int i = ix; i < nb; i += 8) { + device const float * y = yy + i * QK_K + il; + device const uint8_t * ql = x[i].ql + il; + device const uint8_t * qh = x[i].qh + il; + device const int8_t * s = x[i].scales; + + const float d = x[i].d; + + float4 sums = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32); + sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); + } + +#endif + + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } +} + +//============================= templates and their specializations ============================= + +// 
NOTE: this is not dequantizing - we are simply fitting the template +template <typename type4x4> +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template <typename type4x4> +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template <typename type4x4> +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} + +template <typename type4x4> +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + } +} + +template <typename type4x4> +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; + + for (int i=0;i<16;i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } +} + +template <typename type4x4> +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const half d = xb->d; + const half min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + half dl, ml; + uint8_t sc = xb->scales[il]; + +#if QK_K == 256 + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; +#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template <typename type4x4> +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h); + const half ml = 4.h * dl; + + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); + } +#else + float kcoef = il&1 ? 1.f/16.f : 1.f; + uint16_t kmask = il&1 ? 
0xF0 : 0x0F; + float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint8_t m = 1<<(il*2); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); + } +#endif +} + +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; +} + +template <typename type4x4> +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uchar * q = xb->qs; + +#if QK_K == 256 + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; +#else + q = q + 16 * (il&1); + device const uint8_t * s = xb->scales; + device const half2 * dh = (device const half2 *)xb->d; + const float2 d = (float2)dh[0]; + const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; + const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4); +#endif + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template <typename type4x4> +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + +#if QK_K == 256 + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const half qh_val = il<2 ? 16.h : 256.h; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +#else + q = q + 16 * (il&1); + device const int8_t * s = xb->scales; + const float dl = xb->d * s[il]; + uint8_t m = 1<<(il*2); + const float coef = il<2 ? 1.f : 1.f/16.f; + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); + } +#endif +} + +template <typename type4x4> +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + half sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; +#else + ql = ql + 16 * (il&1); + half sc = scales[il]; +#endif + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const half coef = il>1 ? 1.f/16.h : 1.h; + const half ml = d_all * sc * 32.h; + const half dl = d_all * sc * coef; + for (int i = 0; i < 16; ++i) { + const half q = il&1 ? 
((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) + : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } +} + +template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)> +kernel void kernel_get_rows( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg[[threads_per_threadgroup]]) { + const int i = tgpig; + const int r = ((device int32_t *) src1)[i]; + + for (int ind = tiitg; ind < ne00/16; ind += tptg) { + float4x4 temp; + dequantize_func( + ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); + *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; + } +} + +#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B +#define BLOCK_SIZE_K 32 +#define THREAD_MAT_M 4 // each thread takes 4 simdgroup matrices from matrix A +#define THREAD_MAT_N 2 // each thread takes 2 simdgroup matrices from matrix B +#define THREAD_PER_BLOCK 128 +#define THREAD_PER_ROW 2 // 2 threads for each row in matrix A to load numbers +#define THREAD_PER_COL 4 // 4 threads for each row in matrix B to load numbers +#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 +#define SG_MAT_ROW 8 + +// each block_q contains 16*nl weights +template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)> +kernel void kernel_mul_mm(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = (threadgroup half *)(shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + + uint offset0 = im/gqa*nb02; + ushort offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = (device const float *)(src1 + + nb12 * im + + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + // load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + + #pragma unroll(4) + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + + #pragma unroll(8) + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } + + if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { + device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg == 0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { + *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } + } +} + +#if QK_K == 256 +#define QK_NL 16 +#else +#define QK_NL 4 +#endif + +typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ + constant uint64_t &, constant uint64_t &, uint, uint, uint); + +template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>; +template 
[[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; + +typedef void (mat_mm_t)( + device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar *, uint3, uint, uint); + +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; diff --git a/stable-diffusion.cpp/ggml/src/ggml-opencl.cpp b/stable-diffusion.cpp/ggml/src/ggml-opencl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a331f24a92ae340f7c910f219344d8d0467d7af --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml-opencl.cpp @@ -0,0 +1,1931 @@ +#include "ggml-opencl.h" + +#include +#include +#include +#include +#include + +#define CL_TARGET_OPENCL_VERSION 110 +#include + +#include +#include +#include + +#include "ggml.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define CL_DMMV_BLOCK_SIZE 32 + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 1 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +static std::string program_source = MULTILINE_QUOTE( + +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; + +struct __attribute__ ((packed)) block_q4_0 +{ + half d; + uint8_t qs[QK4_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q4_1 +{ + half d; + half m; + uint8_t qs[QK4_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_0 +{ + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_1 +{ + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q8_0 +{ + half d; + int8_t qs[QK8_0]; +}; + +struct __attribute__((packed)) block_q2_K +{ + uint8_t scales[16]; + uint8_t qs[64]; + half d; + half dmin; +}; + +struct __attribute__((packed)) block_q3_K +{ + uint8_t hmask[32]; + uint8_t qs[64]; + uint8_t scales[12]; + half d; +}; + +struct __attribute__((packed)) block_q4_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q5_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qh[32]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q6_K +{ + uint8_t ql[128]; + uint8_t qh[64]; + int8_t scales[16]; + half d; +}; + +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { + const uint i = get_global_id(0); + + y[i] = vload_half(0, &x[i]); +} + +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, &x[ib + 0]); + *v1 = vload_half(0, &x[ib + 1]); +} +); + +static std::string k_quants_source = MULTILINE_QUOTE( +inline void 
get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) +{ + if (j < 4) + { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } + else + { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int n = tid / 32; + const int l = tid - 32 * n; + const int is = 8 * n + l / 16; + + const uint8_t q = x[i].qs[32 * n + l]; + __global float *y = yy + get_group_id(0) * QK_K + 128 * n; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); + y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); + y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); + y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); +} + +__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) +{ + int r = get_local_id(0) / 4; + int i = get_group_id(0) + get_global_offset(0); + int tid = r / 2; + int is0 = r % 2; + int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); + int n = tid / 4; + int j = tid - 4 * n; + + uint8_t m = 1 << (4 * n + j); + int is = 8 * n + 2 * j + is0; + int shift = 2 * j; + + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) + : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) + : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) + : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + float d_all = vload_half(0, &x[i].d); + float dl = d_all * (us - 32); + + __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; + const __global uint8_t *q = x[i].qs + 32 * n; + const __global uint8_t *hm = x[i].hmask; + + for (int l = l0; l < l0 + 4; ++l) + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); +} + +__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int il = tid / 8; + const int ir = tid % 8; + const int is = 2 * il; + const int n = 4; + + __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *q = x[i].qs + 32 * il + n * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + float d1 = dall * sc; + float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + float d2 = dall * sc; + float m2 = dmin * m; + for (int l = 0; l < n; ++l) + { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l + 32] = d2 * (q[l] >> 4) - m2; + } +} + +__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int il = tid / 16; + const int ir = tid % 16; + const int is = 2 * il; + + __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; + __global const uint8_t *qh = x[i].qh + 2 * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << (2 * il); + y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; + y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; +} + +__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) +{ + const int i = get_group_id(0) + get_global_offset(0); + const int tid = get_local_id(0); + const int ip = tid / 32; + const int il = tid - 32 * ip; + const int is = 8 * ip + il / 16; + + __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; + + const float d = vload_half(0, &x[i].d); + + __global const uint8_t *ql = x[i].ql + 64 * ip + il; + const uint8_t qh = x[i].qh[32 * ip + il]; + __global const int8_t *sc = x[i].scales + is; + + y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + +__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q2_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
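+    // each work-group reduces one output row: every thread accumulates a strided
+    // partial dot product over the row's super-blocks into the __local tmp[],
+    // which the barrier/shift loop at the end of the kernel folds down to dst[row];
+    // im selects the lower (0..127) or upper (128..255) half of each 256-wide super-block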
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q3_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
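+    // q3_K layout: the two low bits of each quant live in qs[], the third bit comes
+    // from hmask[] (4 is subtracted below when the high bit is clear), and the 16
+    // six-bit scales are unpacked from the packed 12-byte scales[] array using the
+    // kmask constants declared above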
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + __global const uint8_t * h = x[i].hmask + l0; + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = vload_half(0, &x[i].d); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp[16 * ix + tid] += d * sum; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + //to rename it later, just to test now + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; + + const int step = 8/K_QUANTS_PER_ITERATION; + + const int il = tid/step; // 0...3 + const int ir = tid - step*il;// 0...3 + const int n = 2*K_QUANTS_PER_ITERATION; + + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q4_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const uint8_t * q1 = x[i].qs + q_offset; + __global const uint8_t * q2 = q1 + 64; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = get_local_id(0)/2; // 0...15 + const int ix = get_local_id(0)%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q5_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + __global const uint8_t * ql1 = x[i].qs + q_offset; + __global const uint8_t * ql2 = ql1 + 64; + __global const uint8_t * qh = x[i].qh + l0; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 
16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q6_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +\n#if K_QUANTS_PER_ITERATION == 1\n + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; + +\n#else\n + + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; + +\n#endif\n + + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * ql = x[i].ql + ql_offset; + __global const uint8_t * qh = x[i].qh + qh_offset; + __global const int8_t * s = x[i].scales + s_offset; + + const float d = vload_half(0, &x[i].d); + +\n#if K_QUANTS_PER_ITERATION == 1\n + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp[16 * ix + tid] += sum; +\n#else\n + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp[16 * ix + tid] += sum; +\n#endif\n + + } + + // sum up partial sums and write back result + 
barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +); + + +std::string dequant_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; + + if (i >= get_global_size(0)) { + return; + } + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int ib = i/qk + get_global_offset(0); // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + y[iybs + iqs + 0] = v0; + y[iybs + iqs + y_offset] = v1; +} +); + +std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_group_id(0); + const int tid = get_local_id(0); + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + + +std::string mul_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); + + if (i >= get_global_size(0)) { + return; + } + + dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; +} +); + +#define CL_CHECK(err) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +#define CLBLAST_CHECK(err) \ + do { \ + CLBlastStatusCode err_ = (err); \ + if (err_ != CLBlastSuccess) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +std::array dequant_str_keys = { + "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" +}; + +std::array dequant_str_values = { + "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", + "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_row_f16", "half", "1", "1", "convert_f16" +}; + +std::array dequant_mul_mat_vec_str_values = { + "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", 
"dequantize_q4_0", + "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" +}; + +std::array mul_str_keys = { + "KERNEL_NAME", "TYPE" +}; +std::array mul_str_values = { + "mul_f32", "float" +}; + +static std::string& replace(std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.length(), to); + pos += to.length(); + } + return s; +} + +static std::string generate_kernels() { + std::stringstream src; + src << program_source << '\n'; + src << k_quants_source << '\n'; + for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { + std::string dequant_kernel = dequant_template; + std::string dmmv_kernel = dequant_mul_mat_vec_template; + for (size_t j = 0; j < dequant_str_keys.size(); j++) { + replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); + replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); + } + src << dequant_kernel << '\n'; + src << dmmv_kernel << '\n'; + } + for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { + std::string mul_kernel = mul_template; + for (size_t j = 0; j < mul_str_keys.size(); j++) { + replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); + } + src << mul_kernel << '\n'; + } + + return src.str(); +} + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel convert_row_f16_cl; +static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; +static cl_kernel mul_f32_cl; +static bool fp16_support; + +static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { + cl_program p; + char *program_log; + size_t program_size; + size_t log_size; + int err; + + program_size = strlen(program_buffer); + + p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); + if(err < 0) { + fprintf(stderr, "OpenCL error creating program"); + exit(1); + } + + std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " + "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " + "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); + + err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); + if(err < 0) { + + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + 
        program_log = (char*) malloc(log_size + 1);
+        program_log[log_size] = '\0';
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+        fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
+        free(program_log);
+        exit(1);
+    }
+
+    return p;
+}
+
+void ggml_cl_init(void) {
+    cl_int err;
+
+    struct cl_device;
+    struct cl_platform {
+        cl_platform_id id;
+        unsigned number;
+        char name[128];
+        char vendor[128];
+        struct cl_device * devices;
+        unsigned n_devices;
+        struct cl_device * default_device;
+    };
+
+    struct cl_device {
+        struct cl_platform * platform;
+        cl_device_id id;
+        unsigned number;
+        cl_device_type type;
+        char name[128];
+    };
+
+    enum { NPLAT = 16, NDEV = 16 };
+
+    struct cl_platform platforms[NPLAT];
+    unsigned n_platforms = 0;
+    struct cl_device devices[NDEV];
+    unsigned n_devices = 0;
+    struct cl_device * default_device = NULL;
+
+    platform = NULL;
+    device = NULL;
+
+    cl_platform_id platform_ids[NPLAT];
+    CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
+
+    for (unsigned i = 0; i < n_platforms; i++) {
+        struct cl_platform * p = &platforms[i];
+        p->number = i;
+        p->id = platform_ids[i];
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+        cl_device_id device_ids[NDEV];
+        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+            p->n_devices = 0;
+        } else {
+            CL_CHECK(clGetDeviceIDsError);
+        }
+        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+        p->default_device = NULL;
+
+        for (unsigned j = 0; j < p->n_devices; j++) {
+            struct cl_device * d = &devices[n_devices];
+            d->number = n_devices++;
+            d->id = device_ids[j];
+            d->platform = p;
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+                p->default_device = d;
+            }
+        }
+
+        if (default_device == NULL && p->default_device != NULL) {
+            default_device = p->default_device;
+        }
+    }
+
+    if (n_devices == 0) {
+        fprintf(stderr, "ggml_opencl: could not find any OpenCL devices.\n");
+        exit(1);
+    }
+
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+
+    unsigned n;
+    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+        user_platform_number = (int)n;
+    }
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+        user_device_number = (int)n;
+    }
+    if (user_platform_number != -1 && user_device_number != -1) {
+        cl_platform* platform = &platforms[user_platform_number];
+        if ((unsigned)user_device_number >= platform->n_devices) {
+            fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
+            exit(1);
+        }
+        default_device = &platform->devices[user_device_number];
+    } else {
+
+        struct cl_device * selected_devices = devices;
+        unsigned n_selected_devices = n_devices;
+
+        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+            for (unsigned i = 0; i < n_platforms; i++) {
+                struct cl_platform * p = &platforms[i];
+                if (strstr(p->name, user_platform_string) != NULL ||
+                    strstr(p->vendor, user_platform_string) != NULL) {
+                    user_platform_number = (int)i;
+                    break;
+                }
+            }
+            if (user_platform_number == -1) {
+                fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+                exit(1);
+            }
+        }
+        if (user_platform_number != -1) {
+            struct cl_platform * p = &platforms[user_platform_number];
+            selected_devices = p->devices;
+            n_selected_devices = p->n_devices;
+            default_device = p->default_device;
+            if (n_selected_devices == 0) {
+                fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+                exit(1);
+            }
+        }
+
+        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+            for (unsigned i = 0; i < n_selected_devices; i++) {
+                struct cl_device * d = &selected_devices[i];
+                if (strstr(d->name, user_device_string) != NULL) {
+                    user_device_number = d->number;
+                    break;
+                }
+            }
+            if (user_device_number == -1) {
+                fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+                exit(1);
+            }
+        }
+        if (user_device_number != -1) {
+            selected_devices = &devices[user_device_number];
+            n_selected_devices = 1;
+            default_device = &selected_devices[0];
+        }
+
+        GGML_ASSERT(n_selected_devices > 0);
+
+        if (default_device == NULL) {
+            default_device = &selected_devices[0];
+        }
+    }
+
+    fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+    fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
+    if (default_device->type != CL_DEVICE_TYPE_GPU) {
+        fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    }
+
+    platform = default_device->platform->id;
+    device = default_device->id;
+
+    size_t ext_str_size;
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Check if ext_buffer contains cl_khr_fp16
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+
+    cl_context_properties properties[] = {
+        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
+    };
+
+    CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+    CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+        (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
+        (queue = clCreateCommandQueue(context, device, 0, &err), err)
+    )));
+
+    const std::string kernel_src = generate_kernels();
+
+    program = build_program_from_source(context, device, kernel_src.c_str());
+
+    // FP16 to FP32 kernel
+    CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
+
+    // Dequantize kernels
+    CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
+    CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
+    CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
+    CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
+    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
+    CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
+    CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
+    CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
+    CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
+    CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
+
+    // dequant mul mat kernel
+    CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
+    CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
+}
+
+static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return &dequantize_row_q4_0_cl;
+        case GGML_TYPE_Q4_1:
+            return &dequantize_row_q4_1_cl;
+        case GGML_TYPE_Q5_0:
+            return &dequantize_row_q5_0_cl;
+        case GGML_TYPE_Q5_1:
+            return &dequantize_row_q5_1_cl;
+        case GGML_TYPE_Q8_0:
+            return &dequantize_row_q8_0_cl;
+        case GGML_TYPE_Q2_K:
+            return &dequantize_block_q2_k_cl;
+        case GGML_TYPE_Q3_K:
+            return &dequantize_block_q3_k_cl;
+        case GGML_TYPE_Q4_K:
+            return &dequantize_block_q4_k_cl;
+        case GGML_TYPE_Q5_K:
+            return &dequantize_block_q5_k_cl;
+        case GGML_TYPE_Q6_K:
+            return &dequantize_block_q6_k_cl;
+        case GGML_TYPE_F16:
+            return &convert_row_f16_cl;
+        default:
+            return nullptr;
+    }
+}
+
+static size_t ggml_cl_global_denom(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+            return 4;
+        case GGML_TYPE_Q4_K:
+            return 8;
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return 4;
+        case GGML_TYPE_F16:
+        default:
+            return 1;
+    }
+}
+
+static size_t ggml_cl_local_size(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 0;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+            return 64;
+        case GGML_TYPE_Q4_K:
+            return 32;
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return 64;
+        case GGML_TYPE_F16:
+        default:
+            return 0;
+    }
+}
+
+static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return &dequantize_mul_mat_vec_q4_0_cl;
+        case GGML_TYPE_Q4_1:
+            return &dequantize_mul_mat_vec_q4_1_cl;
+        case GGML_TYPE_Q5_0:
+            return &dequantize_mul_mat_vec_q5_0_cl;
+        case GGML_TYPE_Q5_1:
+            return &dequantize_mul_mat_vec_q5_1_cl;
+        case GGML_TYPE_Q8_0:
+            return &dequantize_mul_mat_vec_q8_0_cl;
+        case GGML_TYPE_F16:
+            return &convert_mul_mat_vec_f16_cl;
+        case GGML_TYPE_Q2_K:
+            return &dequantize_mul_mat_vec_q2_K_cl;
+        case GGML_TYPE_Q3_K:
+            return &dequantize_mul_mat_vec_q3_K_cl;
+        case GGML_TYPE_Q4_K:
+            return &dequantize_mul_mat_vec_q4_K_cl;
+        case GGML_TYPE_Q5_K:
+            return &dequantize_mul_mat_vec_q5_K_cl;
+        case GGML_TYPE_Q6_K:
+            return &dequantize_mul_mat_vec_q6_K_cl;
+        default:
+            return nullptr;
+    }
+}
+
+// buffer pool for cl
+#define MAX_CL_BUFFERS 256
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cl_buffer {
+    cl_mem mem;
+    size_t size = 0;
+};
+
+static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
+static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
+
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cl_pool_lock);
+    cl_int err;
+
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
+    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
+        }
+    }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
+    cl_mem mem;
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
+    *actual_size = size;
+    return mem;
+}
+
+static void ggml_cl_pool_free(cl_mem mem, size_t size) {
+    scoped_spin_lock lock(g_cl_pool_lock);
+
+    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
+        cl_buffer& b = g_cl_buffer_pool[i];
+        if (b.size == 0) {
+            b.mem = mem;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
+    clReleaseMemObject(mem);
+}
+
+void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+    if (tensor->backend != GGML_BACKEND_GPU) {
+        return;
+    }
+
+    cl_mem mem = (cl_mem)tensor->extra;
+    clReleaseMemObject(mem);
+}
+
+static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
+    cl_int err;
+    const uint64_t ne0 = src->ne[0];
+    const uint64_t ne1 = src->ne[1];
+    const uint64_t nb0 = src->nb[0];
+    const uint64_t nb1 = src->nb[1];
+    const uint64_t nb2 = src->nb[2];
+    const uint64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const size_t ts = ggml_type_size(type);
+    const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;
+
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
+    }
+    if (nb0 == ts) {
+        const size_t buffer_origin[3] = { offset, 0, 0 };
+        const size_t host_origin[3] = { 0, 0, 0 };
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
+    }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
+    for (uint64_t i1 = 0; i1 < ne1; i1++) {
+        // pretend the row is a matrix with cols=1
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
+        const size_t host_origin[3] = { 0, 0, 0 };
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
+        if (err != CL_SUCCESS) {
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
+        }
+    }
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
+}
+
+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
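+    // d_Y stays resident on the device; the mul_f32 kernel broadcasts src1 along
+    // each row by indexing y with i % ky (ky = ne10), see mul_template above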
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst + + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const int i0 = i03*ne02 + i02; + + cl_event ev; + + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev)); + + if (nb10 == sizeof(float)) { + // Contiguous, avoid overhead from queueing many kernel runs + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; + + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; + + size_t global = ne00 * ne01; + cl_int ky = ne10; + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + } else { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int64_t i11 = i01%ne11; + const int i1 = i13*ne12*ne11 + i12*ne11 + i11; + + cl_int x_offset = i01*ne00; + cl_int y_offset = i1*ne10; + cl_int d_offset = i01*ne00; + + // compute + size_t global = ne00; + cl_int ky = ne10; + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + } + } + + CL_CHECK(clReleaseEvent(ev)); + CL_CHECK(clFinish(queue)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); + } + } + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_D, d_size); +} + +void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cl_mul_f32(src0, src1, dst); +} + +static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; 
+ } else { + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + + size_t x_offset = 0; + int64_t pi02 = -1; + int64_t pi03 = -1; + + for (int64_t i13 = 0; i13 < ne13; i13++) { + int64_t i03 = i13 / r3; + + for (int64_t i12 = 0; i12 < ne12; i12++) { + int64_t i02 = i12 / r2; + + // copy data to device + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else if (i02 != pi02 || i03 != pi03) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + pi02 = i02; + pi03 = i03; + } + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } + } + + if (src0->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + GGML_ASSERT(fp16_support); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); + const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; + } else { + d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + size_t x_offset = 0; + int64_t pi02 = -1; + int64_t pi03 = -1; + + for (int64_t i13 = 0; i13 < ne13; i13++) { + int64_t i03 = i13 / r3; + + for (int64_t i12 = 0; i12 < ne12; i12++) { + int64_t i02 = i12 / r2; + + // copy src0 to device + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else if (i02 != pi02 || i03 != pi03) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + pi02 = i02; + pi03 = i03; + } + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12); + char * src1i = (char *) src1->data 
+ i13*nb13 + i12*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                }
+                else {
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                    }
+                }
+            }
+            else {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
+                        // very slow due to no inlining
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+            CL_CHECK(clFinish(queue));
+
+            // compute
+            cl_event ev_sgemm;
+            clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
+                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                       ne01, ne11, ne10,
+                                                       alpha,
+                                                       d_X, x_offset, ne00,
+                                                       d_Y, 0, ne10,
+                                                       beta,
+                                                       d_D, 0, ne01,
+                                                       &queue, &ev_sgemm);
+
+            if (status != clblast::StatusCode::kSuccess) {
+                GGML_ASSERT(false);
+            }
+
+            // copy dst to host, then convert to float
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+        }
+    }
+
+    if (src0->backend != GGML_BACKEND_GPU) {
+        ggml_cl_pool_free(d_X, x_size);
+    }
+    ggml_cl_pool_free(d_Y, y_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;
+
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;
+
+    size_t x_size;
+    size_t y_size;
+    size_t d_size;
+    size_t q_size;
+    cl_mem d_X;
+    if (!mul_mat_vec) {
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
+    }
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Q;
+    if (src0->backend == GGML_BACKEND_CPU) {
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
+    }
+
+    cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
+    cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
+    GGML_ASSERT(to_fp32_cl != nullptr);
+
+    const size_t global_denom = ggml_cl_global_denom(type);
+    const size_t local = ggml_cl_local_size(type);
+
+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
+            } else if (src0->backend == GGML_BACKEND_GPU)
{ + d_Q = (cl_mem) src0->extra; + } else { + GGML_ASSERT(false); + } + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); + + // compute + const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; + const size_t local = CL_DMMV_BLOCK_SIZE; + const cl_int ncols = ne00; + events.emplace_back(); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); + } else { // general dequantization kernel + CLBlast matrix matrix multiplication + // convert src0 to fp32 on device + const size_t global = x_ne / global_denom; + const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + events.emplace_back(); + + // wait for conversion + CL_CHECK(clFinish(queue)); + + // compute + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, events.data() + ev_idx++); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } + + ev_idx = 0; + events.clear(); + } + } + + if (!mul_mat_vec) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_BACKEND_CPU) { + ggml_cl_pool_free(d_Q, q_size); + } +} + + +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + return true; + } + + return false; +} + +static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + // If device doesn't support FP16 + if (!fp16_support) { + return false; + } + + size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t 
mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cl_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + return 0; +} + +void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne3 = tensor->ne[3]; + + const ggml_type type = tensor->type; + const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type)); + const size_t q_sz = s_sz * (size_t) (ne2 * ne3); + + size_t q_size; + cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); + + tensor->data = data; + // copy tensor to device + size_t offset = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); + offset += s_sz; + } + } + + CL_CHECK(clFinish(queue)); + + tensor->extra = dst; + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); +} diff --git a/stable-diffusion.cpp/ggml/src/ggml-opencl.h b/stable-diffusion.cpp/ggml/src/ggml-opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..a92b445c9d7660da6ed5dfcbc4cce9ae7a5b9827 --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml-opencl.h @@ -0,0 +1,25 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_cl_init(void); + +void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); + +void * ggml_cl_host_malloc(size_t size); +void ggml_cl_host_free(void * ptr); + +void ggml_cl_free_data(const struct ggml_tensor* tensor); + +void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); + +#ifdef __cplusplus +} +#endif diff --git a/stable-diffusion.cpp/ggml/src/ggml.c b/stable-diffusion.cpp/ggml/src/ggml.c new file mode 100644 index 0000000000000000000000000000000000000000..512b736cea4406614e663d5b4bc750a48d70c0bf --- /dev/null +++ b/stable-diffusion.cpp/ggml/src/ggml.c @@ -0,0 +1,22489 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows + +#include "ggml.h" + +#ifdef GGML_USE_K_QUANTS 
+#include "k_quants.h" +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef GGML_USE_METAL +#include +#endif + +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. +// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnigns +// these functions are never going away, anyway +#pragma warning(disable: 4996) +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; + +static void atomic_store(atomic_int * ptr, LONG val) { + InterlockedExchange(ptr, val); +} +static LONG atomic_load(atomic_int * ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { + (void) unused; + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void * unused) { + (void) unused; + int ret = (int) WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; +} + +static int sched_yield (void) { + Sleep (0); + return 0; +} +#else +#include +#include + +typedef void * thread_ret_t; + +#include +#include +#include + +#endif +#ifdef GGML_USE_CPU_HBM +#include +#endif + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#ifndef __SSE3__ +#define __SSE3__ +#endif +#endif + +/*#define GGML_PERF*/ +#define GGML_DEBUG 0 +#define GGML_GELU_FP16 +#define GGML_GELU_QUICK_FP16 +#define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 + +#define GGML_SOFT_MAX_UNROLL 4 +#define GGML_VEC_DOT_UNROLL 2 +#define GGML_VEC_MAD_UNROLL 32 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) 
printf(__VA_ARGS__) + +// +// end of logging block +// + +#ifdef GGML_USE_ACCELERATE +// uncomment to use vDSP for soft max computation +// note: not sure if it is actually faster +//#define GGML_SOFT_MAX_ACCELERATE +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } + void * aligned_memory = NULL; +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); +#else + int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); +#endif + if (result != 0) { + // Handle allocation failure + const char *error_desc = "unknown allocation error"; + switch (result) { + case EINVAL: + error_desc = "invalid alignment value"; + break; + case ENOMEM: + error_desc = "insufficient memory"; + break; + } + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + return NULL; + } + return aligned_memory; +} +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif +#endif + + +size_t dynamic_mem_size = 0; +size_t max_dynamic_mem_size = 0; + +size_t curr_max_dynamic_mem_size = 0; + +inline static void* ggml_dynamic_malloc(size_t size) { + void *ptr = GGML_ALIGNED_MALLOC(GGML_MEM_ALIGN + size); + dynamic_mem_size += size; + if (dynamic_mem_size > max_dynamic_mem_size) { + max_dynamic_mem_size = dynamic_mem_size; + } + if (dynamic_mem_size > curr_max_dynamic_mem_size) { + curr_max_dynamic_mem_size = dynamic_mem_size; + } + *((size_t*)ptr) = size; + return (char*)ptr + GGML_MEM_ALIGN; +} + +inline static void ggml_dynamic_free(void * ptr) { + void* realptr = (char*)ptr-GGML_MEM_ALIGN; + size_t size = *((size_t*)realptr); + dynamic_mem_size -= size; + GGML_ALIGNED_FREE(realptr); +} + +size_t ggml_dynamic_size(void) { + return dynamic_mem_size; +} + +size_t ggml_max_dynamic_size(void) { + return max_dynamic_mem_size; +} + +size_t ggml_curr_max_dynamic_size(void) { + return curr_max_dynamic_mem_size; +} + +void ggml_reset_curr_max_dynamic_size(void) { + curr_max_dynamic_mem_size = dynamic_mem_size; +} + +#define GGML_DYNAMIC_MALLOC(size) ggml_dynamic_malloc(size) +#define GGML_DYNAMIC_FREE(ptr) ggml_dynamic_free(ptr) + +#define UNUSED GGML_UNUSED +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) + +// +// tensor access macros +// + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#if defined(GGML_USE_ACCELERATE) +#include +#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions +#include "ggml-opencl.h" +#endif +#elif 
defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else +#include +#endif +#elif defined(GGML_USE_CUBLAS) +#include "ggml-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" +#endif + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// floating point type used to accumulate sums +typedef double ggml_float; + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_COMPUTE_FP32_TO_FP16(x) (x) + +#define GGML_FP16_TO_FP32(x) ((float) (x)) +#define GGML_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) 
<< 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// +// global data +// + +// precomputed gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_f16[1 << 16]; + +// precomputed quick gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_quick_f16[1 << 16]; + +// precomputed silu table for f16 (128 KB) +static ggml_fp16_t table_silu_f16[1 << 16]; + +// precomputed exp table for f16 (128 KB) +static ggml_fp16_t table_exp_f16[1 << 16]; + +// precomputed f32 table for f16 (256 KB) +static float table_f32_f16[1 << 16]; + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
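+// A minimal sketch (not part of the upstream sources) of how the lookup tables above
+// are meant to be used: they are filled once, at context initialization (ggml_init()
+// later in this file does this), by converting every possible 16-bit pattern exactly
+// once with GGML_COMPUTE_FP16_TO_FP32. The names below are illustrative only.
+#if 0
+static void fp16_table_init_sketch(void) {
+    for (int i = 0; i < (1 << 16); ++i) {
+        uint16_t ui = (uint16_t) i;
+        ggml_fp16_t fp16;
+        memcpy(&fp16, &ui, sizeof(ui));                      // reinterpret the bit pattern as fp16
+        table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(fp16);  // full conversion, done once per pattern
+    }
+}
+// After initialization, a conversion is a single indexed load:
+//     float f = table_f32_f16[bits];
+// which is what ggml_lookup_fp16_to_fp32 below does.
+#endif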
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +#endif + +// note: do not use these inside ggml.c +// these are meant to be used via the ggml.h API +float ggml_fp16_to_fp32(ggml_fp16_t x) { + return (float) GGML_FP16_TO_FP32(x); +} + +ggml_fp16_t ggml_fp32_to_fp16(float x) { + return GGML_FP32_TO_FP16(x); +} + +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +// +// timing +// + +#if defined(_MSC_VER) || defined(__MINGW32__) +static int64_t timer_freq, timer_start; +void ggml_time_init(void) { + LARGE_INTEGER t; + QueryPerformanceFrequency(&t); + timer_freq = t.QuadPart; + + // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq + // and the uptime is high enough. + // We subtract the program start time to reduce the likelihood of that happening. + QueryPerformanceCounter(&t); + timer_start = t.QuadPart; +} +int64_t ggml_time_ms(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000) / timer_freq; +} +int64_t ggml_time_us(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000000) / timer_freq; +} +#else +void ggml_time_init(void) {} +int64_t ggml_time_ms(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; +} + +int64_t ggml_time_us(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; +} +#endif + +int64_t ggml_cycles(void) { + return clock(); +} + +int64_t ggml_cycles_per_ms(void) { + return CLOCKS_PER_SEC/1000; +} + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif + +// +// cache line +// + +#if defined(__cpp_lib_hardware_interference_size) +#define CACHE_LINE_SIZE hardware_destructive_interference_size +#else +#if defined(__POWER9_VECTOR__) +#define CACHE_LINE_SIZE 128 +#else +#define CACHE_LINE_SIZE 64 +#endif +#endif + +static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); + +// +// quantization +// + +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, 
const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); +} + +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +#if defined(__AVX2__) || defined(__AVX512F__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = _mm256_set_epi64x( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) + +#if !defined(__aarch64__) + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 
2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +#endif +#endif + +#define QK4_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} + +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} + +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} + +static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, 
ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_0); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + } +#else + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_1); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = sum*d; + } +#else + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + const block_q8_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static 
void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }, + [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) 
quantize_row_q8_1_reference, + .vec_dot_type = GGML_TYPE_Q8_1, + }, +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + } +#endif +}; + +// For internal test use +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { + GGML_ASSERT(type < GGML_TYPE_COUNT); + return type_traits[type]; +} + +// +// simd mappings +// + +// we define a common set of C macros which map to specific intrinsics based on the current architecture +// we then implement the fundamental computation operations below using only these macros +// adding support for new architectures requires to define the corresponding SIMD macros +// +// GGML_F32_STEP / GGML_F16_STEP +// number of elements to process in a single step +// +// GGML_F32_EPR / GGML_F16_EPR +// number of elements to fit in a single register +// + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#define GGML_SIMD + +// F32 NEON + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 float32x4_t +#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_F32x4_LOAD vld1q_f32 +#define GGML_F32x4_STORE vst1q_f32 +#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define GGML_F32x4_ADD vaddq_f32 +#define GGML_F32x4_MUL vmulq_f32 +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ 
+ for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + res = GGML_F32x4_REDUCE_ONE(x[0]); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define GGML_F16_STEP 32 + #define GGML_F16_EPR 8 + + #define GGML_F16x8 float16x8_t + #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_STORE vst1q_f16 + #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_F16x8_ADD vaddq_f16 + #define GGML_F16x8_MUL vmulq_f16 + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ + res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } while (0) + + #define GGML_F16_VEC GGML_F16x8 + #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO + #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F16x8_FMA + #define GGML_F16_VEC_ADD GGML_F16x8_ADD + #define GGML_F16_VEC_MUL GGML_F16x8_MUL + #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 + + #define GGML_F16_STEP 16 + #define GGML_F16_EPR 4 + + #define GGML_F32Cx4 float32x4_t + #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_F32Cx4_ADD vaddq_f32 + #define GGML_F32Cx4_MUL vmulq_f32 + #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + + #define GGML_F16_VEC GGML_F32Cx4 + #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO + #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA + #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE +#endif + +#elif defined(__AVX__) + +#define GGML_SIMD + +// F32 AVX + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 8 + +#define GGML_F32x8 __m256 +#define GGML_F32x8_ZERO _mm256_setzero_ps() +#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_F32x8_LOAD _mm256_loadu_ps +#define GGML_F32x8_STORE _mm256_storeu_ps +#if defined(__FMA__) + #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) +#else + #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) +#endif +#define GGML_F32x8_ADD _mm256_add_ps +#define GGML_F32x8_MUL _mm256_mul_ps +#define 
GGML_F32x8_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ + _mm256_extractf128_ps(x[0], 1)); \ + const __m128 t1 = _mm_hadd_ps(t0, t0); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ +} while (0) +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x8 +#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD +#define GGML_F32_VEC_STORE GGML_F32x8_STORE +#define GGML_F32_VEC_FMA GGML_F32x8_FMA +#define GGML_F32_VEC_ADD GGML_F32x8_ADD +#define GGML_F32_VEC_MUL GGML_F32x8_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE + +// F16 AVX + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C +#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = GGML_FP16_TO_FP32(x[i]); + } + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_FP32_TO_FP16(arr[i]); +} +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + +#define GGML_F32Cx8_FMA GGML_F32x8_FMA +#define GGML_F32Cx8_ADD _mm256_add_ps +#define GGML_F32Cx8_MUL _mm256_mul_ps +#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE + +#define GGML_F16_VEC GGML_F32Cx8 +#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE + +#elif defined(__POWER9_VECTOR__) + +#define GGML_SIMD + +// F32 POWER9 + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 vector float +#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_SET1 vec_splats +#define GGML_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_F32x4_ADD vec_add +#define GGML_F32x4_MUL vec_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + res = vec_extract(x[0], 0) + \ + vec_extract(x[0], 1) + \ + vec_extract(x[0], 2) + \ + vec_extract(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define 
GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 POWER9 +#define GGML_F16_STEP GGML_F32_STEP +#define GGML_F16_EPR GGML_F32_EPR +#define GGML_F16_VEC GGML_F32x4 +#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F16_VEC_FMA GGML_F32x4_FMA +#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE +// Use vec_xl, not vec_ld, in case the load address is not aligned. +#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ + vec_extract_fp32_from_shortl(vec_xl(0, p)) +#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_F16_VEC_STORE(p, r, i) \ + if (i & 0x1) \ + vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ + r[i - GGML_ENDIAN_BYTE(0)]), \ + 0, p - GGML_F16_EPR) + +#elif defined(__wasm_simd128__) + +#define GGML_SIMD + +// F32 WASM + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 v128_t +#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F32x4_LOAD wasm_v128_load +#define GGML_F32x4_STORE wasm_v128_store +#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_F32x4_ADD wasm_f32x4_add +#define GGML_F32x4_MUL wasm_f32x4_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 WASM + +#define GGML_F16_STEP 16 +#define GGML_F16_EPR 4 + +inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(p[0]); + tmp[1] = GGML_FP16_TO_FP32(p[1]); + tmp[2] = GGML_FP16_TO_FP32(p[2]); + tmp[3] = GGML_FP16_TO_FP32(p[3]); + + return wasm_v128_load(tmp); +} + +inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { + float tmp[4]; + + wasm_v128_store(tmp, x); + + p[0] = GGML_FP32_TO_FP16(tmp[0]); + p[1] = GGML_FP32_TO_FP16(tmp[1]); + p[2] = GGML_FP32_TO_FP16(tmp[2]); + p[3] = GGML_FP32_TO_FP16(tmp[3]); +} + +#define GGML_F16x4 v128_t +#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_F16x4_FMA GGML_F32x4_FMA +#define GGML_F16x4_ADD wasm_f32x4_add +#define GGML_F16x4_MUL wasm_f32x4_mul +#define GGML_F16x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], 
x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F16_VEC GGML_F16x4 +#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F16x4_FMA +#define GGML_F16_VEC_ADD GGML_F16x4_ADD +#define GGML_F16_VEC_MUL GGML_F16x4_MUL +#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE + +#elif defined(__SSE3__) + +#define GGML_SIMD + +// F32 SSE + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 __m128 +#define GGML_F32x4_ZERO _mm_setzero_ps() +#define GGML_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_F32x4_LOAD _mm_loadu_ps +#define GGML_F32x4_STORE _mm_storeu_ps +#if defined(__FMA__) + // TODO: Does this work? + #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) +#else + #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) +#endif +#define GGML_F32x4_ADD _mm_add_ps +#define GGML_F32x4_MUL _mm_mul_ps +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ +} +// TODO: is this optimal ? 
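+
+// A scalar sketch (illustrative only, not compiled) of what the *_REDUCE
+// macros above compute, given GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR
+// partial-sum registers of GGML_F32_EPR lanes each; lane() is pseudo-code
+// here, not a real intrinsic:
+//
+//   res = 0.0f;
+//   for (int i = 0; i < GGML_F32_ARR; ++i) {
+//       for (int l = 0; l < GGML_F32_EPR; ++l) {
+//           res += lane(x[i], l);
+//       }
+//   }
+//
+// The pairwise halving loops keep the vector adds independent of each other,
+// which tends to pipeline better than one serial accumulation chain.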
+ +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 SSE + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 4 + +static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(x[0]); + tmp[1] = GGML_FP16_TO_FP32(x[1]); + tmp[2] = GGML_FP16_TO_FP32(x[2]); + tmp[3] = GGML_FP16_TO_FP32(x[3]); + + return _mm_loadu_ps(tmp); +} + +static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { + float arr[4]; + + _mm_storeu_ps(arr, y); + + x[0] = GGML_FP32_TO_FP16(arr[0]); + x[1] = GGML_FP32_TO_FP16(arr[1]); + x[2] = GGML_FP32_TO_FP16(arr[2]); + x[3] = GGML_FP32_TO_FP16(arr[3]); +} + +#define GGML_F32Cx4 __m128 +#define GGML_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_F32Cx4_FMA GGML_F32x4_FMA +#define GGML_F32Cx4_ADD _mm_add_ps +#define GGML_F32Cx4_MUL _mm_mul_ps +#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + +#define GGML_F16_VEC GGML_F32Cx4 +#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +#endif + +// GGML_F32_ARR / GGML_F16_ARR +// number of registers to use per step +#ifdef GGML_SIMD +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) +#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) +#endif + +// +// fundamental operations +// + +inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } 
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#ifdef GGML_SIMD + float sumf = 0.0f; + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F32_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } +#else + // scalar + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(x[i]*y[i]); + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { + ggml_float sumf = 0.0; + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F16_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#else + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 
16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
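+        // For example, a stored nibble of 3 decodes to 3 - 8 = -5 here,
+        // mirroring the scalar fallback at the end of this function:
+        //   v0 = (x[i].qs[j] & 0x0F) - 8;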
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( 
d_2_3, p3 );
+    }
+
+    // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
+    for (int i = 2; i < nb; i+=2) {
+        _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
+        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
+        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
+        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
+
+        // Accumulate
+        acc_0 = _mm_add_ps(p0_d, acc_0);
+        acc_1 = _mm_add_ps(p1_d, acc_1);
+        acc_2 = _mm_add_ps(p2_d, acc_2);
+        acc_3 = _mm_add_ps(p3_d, acc_3);
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
+
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        // subtract offset
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+    }
+
+    *s = sumf;
+#else
+    // scalar
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        int sumi = 0;
+
+        for 
(int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = 
y[i].d; + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); 
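+        // Each byte of tmp0/tmp1 is 0x10 where the corresponding qh bit is
+        // clear and 0x00 where it is set, so the vsubq_s8 below matches the
+        // scalar decode ((qs | (bit << 4)) - 16).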
+ const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf 
= wasm_i8x16_sub(v0h, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
+        bx = _mm256_or_si256(bx, bxhi);
+
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps(d, q, acc);
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8((char)0xF0);
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_andnot_si128(bxhil, mask);
+        bxhih = _mm_andnot_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx = MM256_SET_M128I(bxh, bxl);
+
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    // These temporary registers are for masking and shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+        vuint32m2_t xhr_0 
= __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); + vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 
0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = 
wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // temporary registers for shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // load qh + vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + 
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + 
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#endif +} + +// compute GGML_VEC_DOT_UNROLL dot products at once +// xs - x row stride in bytes +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { + ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; + + ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) 
{ + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + } + } + } + + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + } + + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#else + for (int i = 0; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#endif + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + s[i] = sumf[i]; + } +} + +inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] += x[i]*v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +#endif +} + +// xs and vs are byte strides of x and v +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[GGML_VEC_MAD_UNROLL]; + const float * restrict v[GGML_VEC_MAD_UNROLL]; + + for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_F32_VEC_SET1(v[k][0]); + } + + GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + +//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &v, y, 1, n); +#elif defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + 
j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_MUL(ay[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] *= v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] *= v; + } +#endif +} + +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } + +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +inline static float ggml_gelu_f32(float x) { + return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + y[i] = table_gelu_f16[i16[i]]; + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]); + } +} +#endif + +inline static float ggml_gelu_quick_f32(float x) { + return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); +} + +//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_gelu_quick_f16[i16[i]]; +// } +//} + +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_quick_f32(x[i]); + } +} +#endif + +// Sigmoid Linear Unit (SiLU) function +inline static float ggml_silu_f32(float x) { + 
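// added note (illustrative, not from the upstream source): SiLU(x) = x * sigmoid(x),
+    // so SiLU(0) = 0, SiLU(x) -> x for large positive x, and SiLU(x) -> 0 for large negative x
+ 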
return x/(1.0f + expf(-x)); +} + +//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_silu_f16[i16[i]]; +// } +//} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); + } +} +#else +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]); + } +} +#endif + +inline static float ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + // we did not use x[i] to compute forward silu but its f16 equivalent + // take derivative at f16 of x[i]: + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + float usedx = GGML_FP16_TO_FP32(fp16); + dx[i] = ggml_silu_backward_f32(usedx, dy[i]); + } +} +#else +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + } +} +#endif + +inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +#else + vDSP_sve(x, 1, s, n); +#endif +} + +inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +} + +inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_FP16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + float max = -INFINITY; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + *s = max; +#else + vDSP_maxv(x, 1, s, n); +#endif +} + +inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_vec_norm_f32(n, s, x); + *s = 1.f/(*s); +} + +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + +// +// data types +// + +static const char * GGML_OP_NAME[GGML_OP_COUNT] = { + "NONE", + + "DUP", + "ADD", + "ADD1", + "ACC", + "SUB", + "MUL", + "DIV", + "SQR", + "SQRT", + "LOG", + "SUM", + "SUM_ROWS", + "MEAN", + "ARGMAX", + "REPEAT", + "REPEAT_BACK", + "CONCAT", + "SILU_BACK", + "NORM", + "RMS_NORM", + "RMS_NORM_BACK", + "GROUP_NORM", + + "MUL_MAT", + "OUT_PROD", + + "SCALE", + "SET", + "CPY", + "CONT", + "RESHAPE", + "VIEW", + "PERMUTE", + "TRANSPOSE", + "GET_ROWS", + "GET_ROWS_BACK", + "DIAG", + "DIAG_MASK_INF", + "DIAG_MASK_ZERO", + "SOFT_MAX", + "SOFT_MAX_BACK", + "ROPE", + "ROPE_BACK", + "ALIBI", + "CLAMP", + "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", + "CONV_TRANSPOSE_1D", + "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", + "CONV_TRANSPOSE_2D", + "POOL_1D", + 
"POOL_2D", + "UPSCALE", + + "FLASH_ATTN", + "FLASH_FF", + "FLASH_ATTN_BACK", + "WIN_PART", + "WIN_UNPART", + "GET_REL_POS", + "ADD_REL_POS", + + "UNARY", + + "MAP_UNARY", + "MAP_BINARY", + + "MAP_CUSTOM1_F32", + "MAP_CUSTOM2_F32", + "MAP_CUSTOM3_F32", + + "MAP_CUSTOM1", + "MAP_CUSTOM2", + "MAP_CUSTOM3", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", +}; + +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); + +static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { + "none", + + "x", + "x+y", + "x+y", + "view(x,nb,offset)+=y->x", + "x-y", + "x*y", + "x/y", + "x^2", + "√x", + "log(x)", + "Σx", + "Σx_k", + "Σx/n", + "argmax(x)", + "repeat(x)", + "repeat_back(x)", + "concat(x, y)", + "silu_back(x)", + "norm(x)", + "rms_norm(x)", + "rms_norm_back(x)", + "group_norm(x)", + + "X*Y", + "X*Y", + + "x*v", + "y-\\>view(x)", + "x-\\>y", + "cont(x)", + "reshape(x)", + "view(x)", + "permute(x)", + "transpose(x)", + "get_rows(x)", + "get_rows_back(x)", + "diag(x)", + "diag_mask_inf(x)", + "diag_mask_zero(x)", + "soft_max(x)", + "soft_max_back(x)", + "rope(x)", + "rope_back(x)", + "alibi(x)", + "clamp(x)", + "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", + "conv_transpose_1d(x)", + "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", + "conv_transpose_2d(x)", + "pool_1d(x)", + "pool_2d(x)", + "upscale(x)", + + "flash_attn(x)", + "flash_ff(x)", + "flash_attn_back(x)", + "win_part(x)", + "win_unpart(x)", + "get_rel_pos(x)", + "add_rel_pos(x)", + + "unary(x)", + + "f(x)", + "f(x,y)", + + "custom_f32(x)", + "custom_f32(x,y)", + "custom_f32(x,y,z)", + + "custom(x)", + "custom(x,y)", + "custom(x,y,z)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", +}; + +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); + +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); + +static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); +static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); + +// WARN: +// Mis-confguration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). 
+static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; + +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_1D_STAGE_0 ] = true; + p[GGML_OP_CONV_1D_STAGE_1 ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_2D_STAGE_0 ] = true; + p[GGML_OP_CONV_2D_STAGE_1 ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + +// +// ggml context +// + +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + bool dynamic; + bool dynamic_save; + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; +}; + +// +// NUMA support +// + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +// +// ggml state +// + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; +}; + +// global state +static struct ggml_state g_state; +static atomic_int g_state_barrier = 0; + +// barrier via spin lock +inline static void ggml_critical_section_start(void) { + int processing = atomic_fetch_add(&g_state_barrier, 1); + + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); // TODO: reconsider this + processing = atomic_fetch_add(&g_state_barrier, 1); + } +} + +// TODO: make this somehow automatically executed +// some sort of "sentry" mechanism +inline static void ggml_critical_section_end(void) { + atomic_fetch_sub(&g_state_barrier, 1); +} + +void ggml_numa_init(void) { + if (g_state.numa.n_nodes > 0) { + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); + + return; + } + +#ifdef __linux__ + struct stat st; + char path[256]; + int rv; + + // enumerate nodes + while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.n_nodes; + } + + // enumerate CPUs + while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.total_cpus; + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + if (g_state.numa.n_nodes < 1 || 
g_state.numa.total_cpus < 1) { + g_state.numa.n_nodes = 0; + return; + } + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + + if (ggml_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + // TODO +#endif +} + +bool ggml_is_numa(void) { + return g_state.numa.n_nodes > 1; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_print_object(const struct ggml_object * obj) { + GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + obj->type, obj->offs, obj->size, (const void *) obj->next); +} + +void ggml_print_objects(const struct ggml_context * ctx) { + struct ggml_object * obj = ctx->objects_begin; + + GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + + while (obj != NULL) { + ggml_print_object(obj); + obj = obj->next; + } + + GGML_PRINT("%s: --- end ---\n", __func__); +} + +int64_t ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +int64_t ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); +} + +size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); +} + +int ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} + +size_t ggml_type_size(enum ggml_type type) { + return type_traits[type].type_size; +} + +float ggml_type_sizef(enum ggml_type type) { + return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; +} + +const char * ggml_type_name(enum ggml_type type) { + return type_traits[type].type_name; +} + +bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; +} + +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; +} + +const char * ggml_op_symbol(enum ggml_op op) { + return 
GGML_OP_SYMBOL[op]; +} + +size_t ggml_element_size(const struct ggml_tensor * tensor) { + return ggml_type_size(tensor->type); +} + +static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break; + case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break; + case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break; + case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break; + case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; +} + +bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; +} + +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_is_permuted(const struct ggml_tensor * tensor) { + 
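// illustrative note (added comment, not from the upstream source): a non-permuted
+    // tensor has non-decreasing byte strides, nb[0] <= nb[1] <= nb[2] <= nb[3];
+    // permuting dimensions breaks that ordering, which is what the check below detects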
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
+}
+
+static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->ne[0] == t1->ne[0] ) &&
+        (t0->ne[1] == t1->ne[1] ) &&
+        (t0->ne[2] == t1->ne[2] ) &&
+        (t0->ne[3] == t1->ne[3] );
+}
+
+// check if t1 can be represented as a repetition of t0
+static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t1->ne[0]%t0->ne[0] == 0) &&
+        (t1->ne[1]%t0->ne[1] == 0) &&
+        (t1->ne[2]%t0->ne[2] == 0) &&
+        (t1->ne[3]%t0->ne[3] == 0);
+}
+
+static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
+}
+
+static inline int ggml_up32(int n) {
+    return (n + 31) & ~31;
+}
+
+//static inline int ggml_up64(int n) {
+//    return (n + 63) & ~63;
+//}
+
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
+// assert that pointer is aligned to GGML_MEM_ALIGN
+#define ggml_assert_aligned(ptr) \
+    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct ggml_context * ggml_init(struct ggml_init_params params) {
+    // make this function thread safe
+    ggml_critical_section_start();
+
+    static bool is_first_call = true;
+
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            ggml_fp16_t ii;
+            for (int i = 0; i < (1 << 16); ++i) {
+                uint16_t ui = i;
+                memcpy(&ii, &ui, sizeof(ii));
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+                table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
+                table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
+            }
+
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        }
+
+        // initialize g_state
+        {
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            g_state = (struct ggml_state) {
+                /*.contexts =*/ { { 0 } },
+                /*.numa =*/ {
+                    .n_nodes = 0,
+                    .total_cpus = 0,
+                },
+            };
+
+            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
+                g_state.contexts[i].used = false;
+            }
+
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        }
+
+#if defined(GGML_USE_CUBLAS)
+        ggml_init_cublas();
+#elif 
defined(GGML_USE_CLBLAST) + ggml_cl_init(); +#endif + + ggml_setup_op_has_task_pass(); + + is_first_call = false; + } + + // find non-used context in g_state + struct ggml_context * ctx = NULL; + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (!g_state.contexts[i].used) { + g_state.contexts[i].used = true; + ctx = &g_state.contexts[i].context; + + GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + break; + } + } + + if (ctx == NULL) { + GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); + + ggml_critical_section_end(); + + return NULL; + } + + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); + + *ctx = (struct ggml_context) { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, + /*.no_alloc =*/ params.no_alloc, + /*.no_alloc_save =*/ params.no_alloc, + /*.dynamic =*/ params.dynamic, + /*.dynamic_save =*/ params.dynamic, + /*.n_objects =*/ 0, + /*.objects_begin =*/ NULL, + /*.objects_end =*/ NULL, + /*.scratch =*/ { 0, 0, NULL, }, + /*.scratch_save =*/ { 0, 0, NULL, }, + }; + + GGML_ASSERT(ctx->mem_buffer != NULL); + + ggml_assert_aligned(ctx->mem_buffer); + + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); + + ggml_critical_section_end(); + + return ctx; +} + +void ggml_free(struct ggml_context * ctx) { + // make this function thread safe + ggml_critical_section_start(); + + struct ggml_object* obj = ctx->objects_begin; + + while (obj != NULL) { + if (obj->type != GGML_OBJECT_TENSOR) { + obj = obj->next; + continue; + } + struct ggml_tensor* tensor = (struct ggml_tensor*)((char*)ctx->mem_buffer + obj->offs); + if (tensor->not_own_data) { + obj = obj->next; + continue; + } + + if (tensor->dynamic && tensor->data != NULL) { + GGML_DYNAMIC_FREE(tensor->data); + tensor->data = NULL; + } + + obj = obj->next; + } + + bool found = false; + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (&g_state.contexts[i].context == ctx) { + g_state.contexts[i].used = false; + + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); + + if (ctx->mem_buffer_owned) { + GGML_ALIGNED_FREE(ctx->mem_buffer); + } + + found = true; + break; + } + } + + if (!found) { + GGML_PRINT_DEBUG("%s: context not found\n", __func__); + } + + ggml_critical_section_end(); +} + +size_t ggml_used_mem(const struct ggml_context * ctx) { + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; +} + +GGML_API size_t ggml_used_mem_of_data(const struct ggml_context* ctx) { + size_t mem_size = 0; + + struct ggml_object* obj = ctx->objects_begin; + + while (obj != NULL) { + struct ggml_tensor* tensor = (struct ggml_tensor*)((char*)ctx->mem_buffer + obj->offs); + if (tensor->not_own_data || tensor->dynamic) { + obj = obj->next; + continue; + } + + const size_t size = ggml_nbytes(tensor); + + mem_size += size; + + obj = obj->next; + } + + return mem_size; +} + +size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { + const size_t result = ctx->scratch.data ? 
ctx->scratch.offs : 0; + + ctx->scratch = scratch; + + return result; +} + +bool ggml_get_no_alloc(struct ggml_context * ctx) { + return ctx->no_alloc; +} + +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + +void ggml_set_dynamic(struct ggml_context * ctx, bool dynamic) { + ctx->dynamic = dynamic; +} + +bool ggml_get_dynamic(struct ggml_context* ctx) { + return ctx->dynamic; +} + +void ggml_hold_dynamic_tensor(struct ggml_tensor * tensor) { + tensor->dynamic_hold = true; +} + +void ggml_free_dynamic_tensor(struct ggml_tensor * tensor) { + GGML_DYNAMIC_FREE(tensor->data); + tensor->dynamic_hold = false; +} + +void * ggml_get_mem_buffer(const struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(const struct ggml_context * ctx) { + return ctx->mem_size; +} + +size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { + size_t max_size = 0; + + struct ggml_object * obj = ctx->objects_begin; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + + const size_t size = ggml_nbytes(tensor); + + if (max_size < size) { + max_size = size; + } + } + + obj = obj->next; + } + + return max_size; +} + +// IMPORTANT: +// when creating "opt" tensors, always save and load the scratch buffer +// this is an error prone process, but it is necessary to support inplace +// operators when using scratch buffers +// TODO: implement a better way +static void ggml_scratch_save(struct ggml_context * ctx) { + // this is needed to allow opt tensors to store their data + // TODO: again, need to find a better way + ctx->no_alloc_save = ctx->no_alloc; + ctx->no_alloc = false; + + ctx->dynamic_save = ctx->dynamic; + ctx->dynamic = false; + + ctx->scratch_save = ctx->scratch; + ctx->scratch.data = NULL; +} + +static void ggml_scratch_load(struct ggml_context * ctx) { + ctx->no_alloc = ctx->no_alloc_save; + + ctx->dynamic = ctx->dynamic_save; + + ctx->scratch = ctx->scratch_save; +} + +//////////////////////////////////////////////////////////////////////////////// + +static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { + // always insert objects at the end of the context's memory pool + struct ggml_object * obj_cur = ctx->objects_end; + + const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; + const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size;
+    const size_t cur_end = cur_offs + cur_size;
+
+    // align to GGML_MEM_ALIGN
+    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
+
+    char * const mem_buffer = ctx->mem_buffer;
+    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
+
+    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+                __func__, cur_end + size_needed, ctx->mem_size);
+        assert(false);
+        return NULL;
+    }
+
+    *obj_new = (struct ggml_object) {
+        .offs = cur_end + GGML_OBJECT_SIZE,
+        .size = size_needed,
+        .next = NULL,
+        .type = type,
+    };
+
+    ggml_assert_aligned(mem_buffer + obj_new->offs);
+
+    if (obj_cur != NULL) {
+        obj_cur->next = obj_new;
+    } else {
+        // this is the first object in this context
+        ctx->objects_begin = obj_new;
+    }
+
+    ctx->objects_end = obj_new;
+
+    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+    return obj_new;
+}
+
+static struct ggml_tensor * ggml_new_tensor_impl(
+        struct ggml_context * ctx,
+        enum ggml_type type,
+        int n_dims,
+        const int64_t * ne,
+        struct ggml_tensor * view_src,
+        size_t view_offs) {
+
+    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+
+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src = view_src->view_src;
+    }
+
+    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
+    }
+
+    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
+    }
+
+    size_t obj_alloc_size = 0;
+
+    if (view_src == NULL && !ctx->no_alloc && !ctx->dynamic) {
+        if (ctx->scratch.data != NULL) {
+            // allocate tensor data in the scratch buffer
+            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+                assert(false);
+                return NULL;
+            }
+
+            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+
+            ctx->scratch.offs += data_size;
+        } else {
+            // allocate tensor data in the context's memory pool
+            obj_alloc_size = data_size;
+        }
+    }
+
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
+
+    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
+
+    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
+
+    *result = (struct ggml_tensor) {
+        /*.type         =*/ type,
+        /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
+        /*.n_dims       =*/ n_dims,
+        /*.ne           =*/ { 1, 1, 1, 1 },
+        /*.nb           =*/ { 0, 0, 0, 0 },
+        /*.op           =*/ GGML_OP_NONE,
+        /*.op_params    =*/ { 0 },
+        /*.is_param     =*/ false,
+        /*.not_own_data =*/ false,
+        /*.dynamic      =*/ ctx->dynamic,
+        /*.dynamic_hold =*/ false,
+        /*.n_dst        =*/ 0,
+        /*.n_dst_curr   =*/ 0,
+        /*.grad         =*/ NULL,
+        /*.src          =*/ { NULL },
+        /*.perf_runs    =*/ 0,
+        /*.perf_cycles  =*/ 0,
+        /*.perf_time_us =*/ 0,
+        /*.view_src     =*/ view_src,
+        /*.view_offs    =*/ view_offs,
+        /*.data         =*/ (obj_alloc_size > 0 && !ctx->no_alloc && !ctx->dynamic) ? 
(void *)(result + 1) : data, + /*.name =*/ { 0 }, + /*.extra =*/ NULL, + /*.padding =*/ { 0 }, + }; + + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); + + for (int i = 0; i < n_dims; i++) { + result->ne[i] = ne[i]; + } + + result->nb[0] = ggml_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); + for (int i = 2; i < GGML_MAX_DIMS; i++) { + result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + } + + ctx->n_objects++; + + return result; +} + +struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t * ne) { + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); +} + +struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0) { + return ggml_new_tensor(ctx, type, 1, &ne0); +} + +struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1) { + const int64_t ne[2] = { ne0, ne1 }; + return ggml_new_tensor(ctx, type, 2, ne); +} + +struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + const int64_t ne[3] = { ne0, ne1, ne2 }; + return ggml_new_tensor(ctx, type, 3, ne); +} + +struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + return ggml_new_tensor(ctx, type, 4, ne); +} + +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { + ggml_scratch_save(ctx); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + + ggml_scratch_load(ctx); + + ggml_set_i32(result, value); + + return result; +} + +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { + ggml_scratch_save(ctx); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_scratch_load(ctx); + + ggml_set_f32(result, value); + + return result; +} + +struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); +} + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; +} + +struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + memset(tensor->data, 0, ggml_nbytes(tensor)); + return tensor; +} + +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case 
GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return tensor; +} + +struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return tensor; +} + +void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { + const int64_t ne2 = tensor->ne[2]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + + const int64_t i3_ = (i/(ne2*ne1*ne0)); + const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0); + const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0; + const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0); + + if (i0) { + * i0 = i0_; + } + if (i1) { + * i1 = i1_; + } + if (i2) { + * i2 = i2_; + } + if (i3) { + * i3 = i3_; + } +} + +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_ASSERT(false); + 
} + } + + return 0.0f; +} + +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ASSERT(false); + } + + return 0.0f; +} + +void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_ASSERT(false); + } + } + + return 0.0f; +} + +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], 
id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ASSERT(false); + } + + return 0.0f; +} + +void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +void * ggml_get_data(const struct ggml_tensor * tensor) { + return tensor->data; +} + +float * ggml_get_data_f32(const struct ggml_tensor * tensor) { + assert(tensor->type == GGML_TYPE_F32); + return (float *)(tensor->data); +} + +enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_UNARY); + return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); +} + +const char * ggml_get_name(const struct ggml_tensor * tensor) { + return tensor->name; +} + +struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { + strncpy(tensor->name, name, sizeof(tensor->name)); + tensor->name[sizeof(tensor->name) - 1] = '\0'; + return tensor; +} + +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) 
{ + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + +struct ggml_tensor * ggml_view_tensor( + struct ggml_context * ctx, + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + ggml_format_name(result, "%s (view)", src->name); + + for (int i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } + + result->not_own_data = true; + + return result; +} + +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + } + + obj = obj->next; + } + + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// + +// ggml_dup + +static struct ggml_tensor * ggml_dup_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DUP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, true); +} + +// ggml_add + +static struct ggml_tensor * ggml_add_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, true); +} + +// ggml_add_cast + +static struct ggml_tensor * ggml_add_cast_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + + result->op = GGML_OP_ADD; + result->grad = is_node ? 
ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + return ggml_add_cast_impl(ctx, a, b, type); +} + +// ggml_add1 + +static struct ggml_tensor * ggml_add1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, true); +} + +// ggml_acc + +static struct ggml_tensor * ggml_acc_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ACC; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +// ggml_sub + +static struct ggml_tensor * ggml_sub_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SUB; + result->grad = is_node ? 
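+/* ggml_acc above is the accumulating counterpart of ggml_set further down:
+   it adds b into the window of a described by byte strides nb1..nb3 and a
+   byte offset. A sketch with hypothetical F32 tensors dst {W, H} and
+   patch {w, h}, pasted with its top-left corner at column c0, row r0:
+
+     ggml_acc(ctx, dst, patch, dst->nb[1], dst->nb[2], dst->nb[3],
+              r0*dst->nb[1] + c0*dst->nb[0]);
+*/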
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, true); +} + +// ggml_mul + +static struct ggml_tensor * ggml_mul_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + if (inplace) { + GGML_ASSERT(!is_node); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_MUL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, true); +} + +// ggml_div + +static struct ggml_tensor * ggml_div_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + if (inplace) { + GGML_ASSERT(!is_node); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DIV; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, true); +} + +// ggml_sqr + +static struct ggml_tensor * ggml_sqr_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQR; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, true); +} + +// ggml_sqrt + +static struct ggml_tensor * ggml_sqrt_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? 
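+/* The recurring _impl pattern: inplace ops wrap a in a view (same buffer,
+   new graph node) while non-inplace ops duplicate it; either way only the
+   node is recorded here -- the math runs later in ggml_graph_compute. */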
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQRT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, true); +} + +// ggml_log + +static struct ggml_tensor * ggml_log_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_LOG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, true); +} + +// ggml_sum + +struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_SUM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_sum_rows + +struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + int64_t ne[4] = {1,1,1,1}; + for (int i=1; i<a->n_dims; ++i) { + ne[i] = a->ne[i]; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + + result->op = GGML_OP_SUM_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_mean + +struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + + result->op = GGML_OP_MEAN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_argmax + +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_repeat + +struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(a, b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT; + result->grad = is_node ?
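+/* Shape bookkeeping for the reductions above, e.g. for ne = {64, 8}:
+   ggml_sum -> {1} (same type), ggml_sum_rows -> {1, 8} (ne[0] collapses),
+   ggml_mean -> {1, 8} in F32, ggml_argmax -> {8} in I32.
+   ggml_repeat goes the other way, tiling a up to b's shape. */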
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_repeat_back + +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_concat + +struct ggml_tensor * ggml_concat( + struct ggml_context* ctx, + struct ggml_tensor* a, + struct ggml_tensor* b) { + GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); + + result->op = GGML_OP_CONCAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_abs + +struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ABS); +} + +struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS); +} + +// ggml_sgn + +struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SGN); +} + +struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN); +} + +// ggml_neg + +struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_NEG); +} + +struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG); +} + +// ggml_step + +struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_STEP); +} + +struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP); +} + +// ggml_tanh + +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_TANH); +} + +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH); +} + +// ggml_elu + +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ELU); +} + +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU); +} + +// ggml_relu + +struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_RELU); +} + +struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU); +} + +// ggml_gelu + +struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return 
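+/* Every element-wise activation from ggml_abs through ggml_silu is a single
+   GGML_OP_UNARY node; the selector (GGML_UNARY_OP_*) rides in op_params and
+   is read back at compute time with ggml_get_unary_op. */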
ggml_unary(ctx, a, GGML_UNARY_OP_GELU); +} + +struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU); +} + +// ggml_gelu_quick + +struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK); +} + +struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK); +} + +// ggml_silu + +struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SILU); +} + +struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU); +} + +// ggml_silu_back + +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SILU_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_norm + +static struct ggml_tensor * ggml_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, false); +} + +struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, true); +} + +// ggml_rms_norm + +static struct ggml_tensor * ggml_rms_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_RMS_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, false); +} + +struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, true); +} + +// ggml_rms_norm_back + +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps) { + bool is_node = false; + + if (a->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_RMS_NORM_BACK; + result->grad = is_node ? 
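+/* rms_norm scales each row by its root-mean-square with no mean subtraction:
+   y_i = x_i / sqrt(mean(x^2) + eps); the eps written to op_params above is
+   what keeps near-zero rows numerically stable. */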
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_group_norm + +static struct ggml_tensor * ggml_group_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + bool inplace) { + + bool is_node = false; + if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_GROUP_NORM; + result->op_params[0] = n_groups; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? + + return result; +} + +struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, false); +} + +struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, true); +} + +// ggml_mul_mat + +struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_out_prod + +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] + const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_scale + +static struct ggml_tensor * ggml_scale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SCALE; + result->grad = is_node ? 
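+/* ggml_mul_mat shape convention above, in ne[] order (ne[0] is the innermost
+   dim): a = {K, M}, b = {K, N} -> result = {M, N} in F32, i.e.
+   result[m,n] = sum_k a[k,m]*b[k,n]; the batch dims of the result follow b.
+   A sketch with hypothetical K, M, N:
+
+     // a: {K, M}, b: {K, N}  =>  ggml_mul_mat(ctx, a, b)->ne = {M, N, 1, 1}
+*/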
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, true); +} + +// ggml_set + +static struct ggml_tensor * ggml_set_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SET; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); +} + +struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); +} + +// ggml_cpy + +static struct ggml_tensor * ggml_cpy_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = ggml_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_format_name(result, "%s (copy)", a->name); + } + + result->op = GGML_OP_CPY; + result->grad = is_node ?
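+/* ggml_cpy copies -- and, when the types differ, converts -- a into b's
+   buffer; the returned node is a view of b, so this is also the graph idiom
+   for casting, e.g. copying an F32 tensor into a freshly allocated F16
+   tensor of the same shape. */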
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, true); +} + +// ggml_cont + +static struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); +} + +// make contiguous, with new shape +GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + return ggml_cont_4d(ctx, a, ne0, 1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1); +} + +struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3)); + + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + ggml_format_name(result, "%s (cont)", a->name); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_reshape + +struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_contiguous(a)); + // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (b->grad) { + // gradient propagation is not supported + //GGML_ASSERT(false); + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->not_own_data = true; + result->grad = is_node ? 
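+/* ggml_reshape only relabels dimensions without touching data, hence the
+   contiguity assert above; after a permute/transpose, materialize first.
+   A sketch (names hypothetical):
+
+     // flatten the transposed t into 2-D:
+     // ggml_reshape_2d(ctx, ggml_cont(ctx, ggml_transpose(ctx, t)), ne0, ne1);
+*/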
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[1] = { ne0 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->not_own_data = true; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[2] = { ne0, ne1 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->not_own_data = true; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[3] = { ne0, ne1, ne2 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->not_own_data = true; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->not_own_data = true; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +static struct ggml_tensor * ggml_view_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_dims, + const int64_t * ne, + size_t offset) { + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); + ggml_format_name(result, "%s (view)", a->name); + + ggml_set_op_params(result, &offset, sizeof(offset)); + + result->op = GGML_OP_VIEW; + result->not_own_data = true; + result->grad = is_node ? 
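+/* ggml_view_* returns a tensor that aliases a's buffer (not_own_data) at a
+   byte offset; the offset is also stashed in op_params. E.g. row r of a 2-D
+   tensor t as a 1-D view (names hypothetical):
+
+     struct ggml_tensor * row = ggml_view_1d(ctx, t, t->ne[0], r*t->nb[1]);
+*/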
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_view_1d + +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset) { + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); + + return result; +} + +// ggml_view_2d + +struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, + size_t offset) { + + const int64_t ne[2] = { ne0, ne1 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = result->nb[1]*ne1; + result->nb[3] = result->nb[2]; + + return result; +} + +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + + const int64_t ne[3] = { ne0, ne1, ne2 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + return result; +} + +// ggml_view_4d + +struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = nb3; + + return result; +} + +// ggml_permute + +struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS); + GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS); + GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS); + GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS); + + GGML_ASSERT(axis0 != axis1); + GGML_ASSERT(axis0 != axis2); + GGML_ASSERT(axis0 != axis3); + GGML_ASSERT(axis1 != axis2); + GGML_ASSERT(axis1 != axis3); + GGML_ASSERT(axis2 != axis3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); + + int ne[GGML_MAX_DIMS]; + int nb[GGML_MAX_DIMS]; + + ne[axis0] = a->ne[0]; + ne[axis1] = a->ne[1]; + ne[axis2] = a->ne[2]; + ne[axis3] = a->ne[3]; + + nb[axis0] = a->nb[0]; + nb[axis1] = a->nb[1]; + nb[axis2] = a->nb[2]; + nb[axis3] = a->nb[3]; + + result->ne[0] = ne[0]; + result->ne[1] = ne[1]; + result->ne[2] = ne[2]; + result->ne[3] = ne[3]; + + result->nb[0] = nb[0]; + result->nb[1] = nb[1]; + result->nb[2] = nb[2]; + result->nb[3] = nb[3]; + + result->op = GGML_OP_PERMUTE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_set_op_params(result, params, sizeof(params)); + + return result; +} + +// ggml_transpose + +struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); + + result->ne[0] = a->ne[1]; + result->ne[1] = a->ne[0]; + + result->nb[0] = a->nb[1]; + result->nb[1] = a->nb[0]; + + result->op = GGML_OP_TRANSPOSE; + result->grad = is_node ? 
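+/* ggml_transpose is the axis0<->axis1 special case of ggml_permute: both
+   only swap ne/nb entries on a view and move no bytes, so the result is
+   non-contiguous and usually needs ggml_cont before ops that demand
+   contiguous input. */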
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_get_rows + +struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); + + result->op = GGML_OP_GET_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_get_rows_back + +struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + + result->op = GGML_OP_GET_ROWS_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_diag + +struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(a->ne[1] == 1); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + + result->op = GGML_OP_DIAG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_diag_mask_inf + +static struct ggml_tensor * ggml_diag_mask_inf_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_DIAG_MASK_INF; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, true); +} + +// ggml_diag_mask_zero + +static struct ggml_tensor * ggml_diag_mask_zero_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_DIAG_MASK_ZERO; + result->grad = is_node ? 
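+/* diag_mask_inf writes -INF above the diagonal past n_past so that the
+   soft_max which follows in an attention block drives those weights to
+   zero -- the causal-mask idiom is diag_mask_inf_inplace followed by
+   soft_max_inplace. */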
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, true); +} + +// ggml_soft_max + +static struct ggml_tensor * ggml_soft_max_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, true); +} + +// ggml_soft_max_back + +static struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); +} + +// ggml_rope + +static struct ggml_tensor * ggml_rope_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down, + bool inplace) { + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ROPE; + result->grad = is_node ? 
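+/* RoPE rotates consecutive feature pairs of the first n_dims channels by a
+   position-dependent angle, essentially
+   theta_i = (freq_scale * pos) * freq_base^(-2i/n_dims); b carries one I32
+   position per slot along dim 2 (hence a->ne[2] == b->ne[0]), and
+   xpos_base/xpos_down select the xPos decay variant. */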
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx) { + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); +} + +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx) { + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + float base, + bool down) { + return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); +} + +// ggml_rope_back + +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down) { + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); + + bool is_node = false; + + if (a->grad) { + is_node = false; // TODO: implement backward + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ROPE_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_alibi + +struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_set_op_params(result, op_params, sizeof(op_params)); + + result->op = GGML_OP_ALIBI; + result->grad = is_node ? 
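+/* ALiBi skips rotary/learned positions and instead adds a linear penalty to
+   the attention scores: each head gets a fixed slope (a negative power of
+   two derived from n_head, with bias_max bounding the range) multiplied by
+   the query-key distance. */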
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_clamp + +struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + float params[] = { min, max }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CLAMP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_conv_1d + +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} + +// im2col: [N, IC, IL] => [N, OL, IC*K] +// a: [OC,IC, K] +// b: [N, IC, IL] +// result: [N, OL, IC*K] +static struct ggml_tensor * ggml_conv_1d_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(a->ne[1] == b->ne[1]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + const int64_t ne[4] = { + a->ne[1] * a->ne[0], + OL, + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_1D_STAGE_0; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d_stage_1 + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// a: [OC, IC, K] +// b: [N, OL, IC * K] +// result: [N, OC, OL] +static struct ggml_tensor * ggml_conv_1d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + a->ne[2], + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_1D_STAGE_1; + result->grad = is_node ? 
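+/* ggml_calc_conv_output_size above is the usual PyTorch-style formula
+   floor((ins + 2p - d*(ks-1) - 1)/s) + 1, e.g. ins=64, ks=3, s=1, p=1, d=1
+   -> (64 + 2 - 2 - 1)/1 + 1 = 64 ("same" padding). Splitting conv into
+   im2col (stage_0, F16 scratch) plus one matmul (stage_1) trades memory
+   for a single large GEMM. */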
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d + +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); + result = ggml_conv_1d_stage_1(ctx, a, result); + return result; +} + +// GGML_API struct ggml_tensor * ggml_conv_1d( +// struct ggml_context * ctx, +// struct ggml_tensor * a, +// struct ggml_tensor * b, +// int s0, +// int p0, +// int d0) { +// GGML_ASSERT(ggml_is_matrix(b)); +// GGML_ASSERT(a->ne[1] == b->ne[1]); +// bool is_node = false; + +// if (a->grad || b->grad) { +// GGML_ASSERT(false); // TODO: implement backward +// is_node = true; +// } + +// const int64_t ne[4] = { +// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), +// a->ne[2], 1, 1, +// }; +// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + +// int32_t params[] = { s0, p0, d0 }; +// ggml_set_op_params(result, params, sizeof(params)); + +// result->op = GGML_OP_CONV_1D; +// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; +// result->src[0] = a; +// result->src[1] = b; + +// return result; +// } + +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + +// ggml_conv_transpose_1d + +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); + + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_TRANSPOSE_1D; + result->grad = is_node ? 
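+/* The transposed-conv length formula above inverts the conv one:
+   (ins - 1)*s - 2p + d*(ks - 1) + 1, e.g. ins=32, ks=4, s=2, p=0, d=1
+   -> 31*2 + 3 + 1 = 66 -- the upsampling step used by decoder blocks. */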
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_2d + +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct ggml_tensor * ggml_conv_2d_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + GGML_ASSERT(a->ne[2] == b->ne[2]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + const int64_t ne[4] = { + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); + + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D_STAGE_0; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct ggml_tensor * ggml_conv_2d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? 
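+/* Worked 2-D example: a 3x3 kernel, IC=4, OC=8 on a 16x16 input with
+   s=1, p=1, d=1. stage_0 (im2col) produces {IC*KH*KW = 36, OW=16, OH=16, N}
+   in F16; this stage_1 GEMM against the kernel then yields
+   {OW=16, OH=16, OC=8, N}. */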
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// ggml_conv_2d_sk_p0 +struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1); +} + +// ggml_conv_2d_s1_ph + +struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); +} + +// ggml_conv_transpose_2d_p0 + +static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { + return (ins - 1) * s - 2 * p + ks; +} + +struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + GGML_ASSERT(a->ne[3] == b->ne[2]); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), + ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), + a->ne[2], b->ne[3], + }; + + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, stride); + + result->op = GGML_OP_CONV_TRANSPOSE_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_pool_* + +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_pool_1d + +struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + int32_t params[] = { op, k0, s0, p0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_POOL_1D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_pool_2d + +struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_POOL_2D; + result->grad = is_node ? 
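+/* Pooling reuses the conv size formula without dilation:
+   (ins + 2p - ks)/s + 1, e.g. 2x2 pooling with s=2, p=0 on a 64x64 map
+   -> (64 - 2)/2 + 1 = 32 per side. */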
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_upscale + +static struct ggml_tensor * ggml_upscale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] * scale_factor, + a->ne[1] * scale_factor, + a->ne[2], a->ne[3]); + + result->op = GGML_OP_UPSCALE; + result->op_params[0] = scale_factor; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + return ggml_upscale_impl(ctx, a, scale_factor); +} + +// ggml_flash_attn + +struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + + int32_t t = masked ? 1 : 0; + ggml_set_op_params(result, &t, sizeof(t)); + + result->op = GGML_OP_FLASH_ATTN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + + return result; +} + +// ggml_flash_ff + +struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1) { + GGML_ASSERT(ggml_can_mul_mat(b0, a)); + // TODO: more checks + + bool is_node = false; + + if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + + result->op = GGML_OP_FLASH_FF; + result->grad = is_node ? 
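+/* flash_attn fuses the softmax(K^T q / sqrt(D)) * V chain into one op so the
+   full attention matrix is never materialized; `masked` applies the causal
+   mask inside the kernel, and flash_ff is the matching fused feed-forward. */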
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; + + return result; +} + +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,kvne2,ne3] + // v shape [M,D,kvne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + const int64_t kvne2 = k->ne[2]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == kvne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == kvne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + GGML_ASSERT(ne2 % kvne2 == 0); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + const int64_t elem_v = ggml_nelements(v); + + enum ggml_type result_type = GGML_TYPE_F32; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN); + + const size_t nelements = (end + tsize - 1)/tsize; + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements); + + int32_t masked_i = masked ? 1 : 0; + ggml_set_op_params(result, &masked_i, sizeof(masked_i)); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + + return result; +} + +// ggml_win_part + +struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w) { + GGML_ASSERT(a->ne[3] == 1); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // padding + const int px = (w - a->ne[1]%w)%w; + const int py = (w - a->ne[2]%w)%w; + + const int npx = (px + a->ne[1])/w; + const int npy = (py + a->ne[2])/w; + const int np = npx*npy; + + const int64_t ne[4] = { a->ne[0], w, w, np, }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { npx, npy, w }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_WIN_PART; + result->grad = is_node ? 
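+/* win_part pads H and W up to multiples of w (the px/py above) and splits
+   the map into np = npx*npy non-overlapping w x w windows; e.g. a 14x14 map
+   with w=8 pads to 16x16 and yields np = 4. ggml_win_unpart below undoes it. */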
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_win_unpart + +struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + int32_t params[] = { w }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_WIN_UNPART; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_get_rel_pos + +struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh) { + GGML_ASSERT(qh == kh); + GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); + + result->op = GGML_OP_GET_REL_POS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +// ggml_add_rel_pos + +static struct ggml_tensor * ggml_add_rel_pos_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(pw, ph)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(pw)); + GGML_ASSERT(ggml_is_contiguous(ph)); + GGML_ASSERT(ph->type == GGML_TYPE_F32); + GGML_ASSERT(pw->type == GGML_TYPE_F32); + GGML_ASSERT(pw->ne[3] == a->ne[2]); + GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); + GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); + + bool is_node = false; + + if (!inplace && (a->grad || pw->grad || ph->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); + + result->op = GGML_OP_ADD_REL_POS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = pw; + result->src[2] = ph; + + return result; +} + +struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, false); +} + +struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); +} + +// ggml_unary + +static struct ggml_tensor * ggml_unary_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, (int32_t) op); + + result->op = GGML_OP_UNARY; + result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, false); +} + +struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, true); +} + +// ggml_map_unary + +static struct ggml_tensor * ggml_map_unary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_UNARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, true); +} + +// ggml_map_binary + +static struct ggml_tensor * ggml_map_binary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_BINARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, true); +} + +// ggml_map_custom1_f32 + +static struct ggml_tensor * ggml_map_custom1_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM1_F32; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, true); +} + +// ggml_map_custom2_f32 + +static struct ggml_tensor * ggml_map_custom2_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM2_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); +} + +// ggml_map_custom3_f32 + +static struct ggml_tensor * ggml_map_custom3_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM3_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); +} + +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? 
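+ // inplace variants return a view over a's buffer instead of allocating a duplicate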
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_cross_entropy_loss_back + +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor) { + tensor->is_param = true; + + GGML_ASSERT(tensor->grad == NULL); + tensor->grad = ggml_dup_tensor(ctx, tensor); +} + +// ggml_compute_forward_dup + +static void ggml_compute_forward_dup_same_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_ASSERT(src0->type == dst->type); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const size_t nb00 = src0->nb[0]; + const size_t nb0 = dst->nb[0]; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + if (ie0 < ie1) { + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * ggml_type_size(src0->type)); + } + +} +static void ggml_compute_forward_dup_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + 
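+ // fast path: same type and fully contiguous layout, so dup reduces to a parallel memcpy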
ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < 
ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + // advance the dst coordinates, wrapping against the dst dims (ne0..ne3), as in the f32 branch below + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } +} + +static void ggml_compute_forward_dup_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy(
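+ // rows share the same layout in src0 and dst here, so each row moves with a single memcpy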
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (nb00 == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(float)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < 
ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } +} + +static void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_dup_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_dup_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add + +static void ggml_compute_forward_add_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); +#else + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); +#endif + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + 
i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void 
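+ // quantized add: dequantize a src0 row into a per-thread f32 scratch buffer, accumulate src1, then requantize the row into dst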
ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + const enum ggml_type dtype = dst->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } + } +} + +static void ggml_compute_forward_add( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_add_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add1 + +static void ggml_compute_forward_add1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == 
GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_add1_f32); + + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_add1_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) src1->data); +#endif + } +} + +static void ggml_compute_forward_add1_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < 
ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + + // we don't support permuted src0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3 )); + + assert(ne0 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne0); + // add src1 + ggml_vec_acc1_f32(ne0, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne0); + } +} + +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add1_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_acc + +static void ggml_compute_forward_acc_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, +
const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset in bytes during acc + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during acc + const size_t nb0 = ggml_element_size(src0); + + const size_t nb00 = nb0; + const size_t nb01 = nb1; + const size_t nb02 = nb2; + const size_t nb03 = nb3; + + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); +#else + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + } +} + +static void ggml_compute_forward_acc( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_acc_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sub + +static void ggml_compute_forward_sub_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); +
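+ // sub is scheduled on a single thread, hence the ith == 0 assert above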
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + vDSP_vsub( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_sub_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_sub( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sub_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mul + +static void ggml_compute_forward_mul_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + const int ith = params->ith; + const int nth = params->nth; + +#ifdef GGML_USE_CLBLAST + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; + } +#endif + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(ne00 == ne10); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); + + vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, 
ne00); +#else + ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_mul( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mul_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_div + +static void ggml_compute_forward_div_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + + vDSP_vdiv( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_div_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_div( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + 
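+ // only the f32/f32 path is implemented; other src0 types hit the assert in the default case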
ggml_compute_forward_div_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sqr + +static void ggml_compute_forward_sqr_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqr_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqr( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqr_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sqrt + +static void ggml_compute_forward_sqrt_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqrt_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqrt( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqrt_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_log + +static void ggml_compute_forward_log_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_log_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_log( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_log_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sum + +static void ggml_compute_forward_sum_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(ggml_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_LOCALS(int64_t, ne0, 
src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + ggml_float sum = 0; + ggml_float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32_ggf(ne00, + &row_sum, + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; + } + } + } + ((float *) dst->data)[0] = sum; +} + +static void ggml_compute_forward_sum_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); +} + +static void ggml_compute_forward_sum( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_sum_f16(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sum_rows + +static void ggml_compute_forward_sum_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; + } + } + } +} + +static void ggml_compute_forward_sum_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_rows_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mean + +static void ggml_compute_forward_mean_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); + + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 
< ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + } + } + } +} + +static void ggml_compute_forward_mean( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mean_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_argmax + +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat + +static void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: maybe this is not optimal? 
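+    // how the tiling below works: dst is an nr3 x nr2 x nr1 x nr0 grid of
+    // copies of src0; the i* loops pick a copy, the k* loops walk the source
+    // rows, and each ggml_vec_cpy_f32 call moves one contiguous run of ne00
+    // floats. worked example: repeating src0 with ne00=4, ne01=2 into dst with
+    // ne0=8, ne1=6 gives nr0=2, nr1=3; source row k1 lands in dst row
+    // i1*ne01 + k1 (i1 = 0..2), at column offset i0*ne00 (i0 = 0..1)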
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_repeat_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? 
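+    // mirror image of ggml_compute_forward_repeat_f32: src0 here is the larger
+    // (repeated) tensor, so after dst was zeroed above, each of the
+    // nr3*nr2*nr1*nr0 tiles of src0 is accumulated into the single dst tile via
+    // ggml_vec_acc_f32; summing over the copies is the gradient of the repeat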
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne3; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne2; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne1; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                ggml_vec_acc_f32(ne0,
+                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
+                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat_back(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_repeat_back_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        // split the slices along dim2 across the threads; stepping by nth
+        // (rather than i2++) gives each slice exactly one owning thread
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            if (i2 < ne02) { // src0
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            } // src1
+            else {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params* params,
+    const struct ggml_tensor* src0,
+    const struct ggml_tensor* src1,
+    struct ggml_tensor* dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_abs
+
+static void ggml_compute_forward_abs_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_abs_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_abs(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_abs_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_sgn
+
+static void
ggml_compute_forward_sgn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sgn_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sgn( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sgn_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_neg + +static void ggml_compute_forward_neg_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_neg_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_neg( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_neg_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_step + +static void ggml_compute_forward_step_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_step_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_step( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_step_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_tanh + +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) 
src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_relu + +static void ggml_compute_forward_relu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_relu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_relu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_gelu + +static void ggml_compute_forward_gelu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + 
struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_gelu_quick + +static void ggml_compute_forward_gelu_quick_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_quick_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_quick( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_quick_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_silu + +static void ggml_compute_forward_silu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_silu_back + +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, grad)); + + if 
(params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_norm + +static void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + + float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_float sum2 = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v*v); + } + + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_group_rms_norm + +static void ggml_compute_forward_rms_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + 
i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)(x[i00] * x[i00]); + } + + const float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + memcpy(y, x, ne00 * sizeof(float)); + // for (int i00 = 0; i00 < ne00; i00++) { + // y[i00] = x[i00]; + // } + + const float scale = 1.0f/sqrtf(mean + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_rms_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_rms_norm_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_BINARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + // src1 is same shape as src0 => same indices + const int64_t i11 = i01; + const int64_t i12 = i02; + const int64_t i13 = i03; + + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + + ggml_float sum_xx = 0.0; + ggml_float sum_xdz = 0.0; + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum_xx += (ggml_float)(x[i00] * x[i00]); + sum_xdz += (ggml_float)(x[i00] * dz[i00]); + } + + //const float mean = (float)(sum_xx)/ne00; + const float mean_eps = (float)(sum_xx)/ne00 + eps; + const float sum_eps = (float)(sum_xx) + eps*ne00; + //const float mean_xdz = (float)(sum_xdz)/ne00; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. 
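+            // the derivation below boils down to this closed form, which is
+            // what the ggml_vec_* calls at the end of the loop body implement:
+            //
+            //   dx = rrms * (dz - x * sum_xdz / (sum_xx + ne00*eps))
+            //
+            // intuitively: take dz, subtract the component along x that merely
+            // rescales the rms, then divide by the rms itself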
+ //const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + { + // z = rms_norm(x) + // + // rms_norm(src0) = + // scale( + // src0, + // div( + // 1, + // sqrt( + // add( + // scale( + // sum( + // sqr( + // src0)), + // (1.0/N)), + // eps)))); + + // postorder: + // ## op args grad + // 00 param src0 grad[#00] + // 01 const 1 + // 02 sqr (#00) grad[#02] + // 03 sum (#02) grad[#03] + // 04 const 1/N + // 05 scale (#03, #04) grad[#05] + // 06 const eps + // 07 add (#05, #06) grad[#07] + // 08 sqrt (#07) grad[#08] + // 09 div (#01,#08) grad[#09] + // 10 scale (#00,#09) grad[#10] + // + // backward pass, given grad[#10] + // #10: scale + // grad[#00] += scale(grad[#10],#09) + // grad[#09] += sum(mul(grad[#10],#00)) + // #09: div + // grad[#08] += neg(mul(grad[#09], div(#09,#08))) + // #08: sqrt + // grad[#07] += mul(grad[#08], div(0.5, #08)) + // #07: add + // grad[#05] += grad[#07] + // #05: scale + // grad[#03] += scale(grad[#05],#04) + // #03: sum + // grad[#02] += repeat(grad[#03], #02) + // #02: + // grad[#00] += scale(mul(#00, grad[#02]), 2.0) + // + // substitute and simplify: + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#02] = repeat(grad[#03], #02) + // grad[#02] = repeat(scale(grad[#05],#04), #02) + // grad[#02] = repeat(scale(grad[#07],#04), #02) + // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) + // a = b*c + d*e + // a = b*c*f/f + d*e*f/f + // a = (b*c*f + d*e*f)*(1/f) + // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) + // a = (b + d*e/c)*c + // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) + // a = (dz + 
x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms + // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms + // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms + // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms + // a = (dz + x*div(-mean_xdz,mean_eps))*rrms + // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) + // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + } + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // post-order: + // dx := x + // dx := scale(dx,-mean_xdz/mean_eps) + // dx := add(dx, dz) + // dx := scale(dx, rrms) + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_vec_cpy_f32 (ne00, dx, x); + // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_vec_acc_f32 (ne00, dx, dz); + ggml_vec_scale_f32(ne00, dx, rrms); + } + } + } +} + +static void ggml_compute_forward_rms_norm_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_group_norm + +static void ggml_compute_forward_group_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const float eps = 1e-6f; // TODO: make this a parameter + + // TODO: optimize + + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i+=nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + } + } + float mean = sum / (ne00 * ne01 * step); + ggml_float sum2 = 0.0; + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v * v); + } + } + } + float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_vec_scale_f32(ne00, y, scale); + } + } + } + } +} + +static void ggml_compute_forward_group_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + 
struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_group_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mul_mat + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} +#endif + +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (params->ith != 0) { + return; + } + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + to_float((const char *) x + i01*nb01, wdata + id, ne00); + id += ne00; + } + + assert(id*sizeof(float) <= params->wsize); + x = wdata; + } + + cblas_sgemm(CblasRowMajor, 
CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } +} + +// ggml_compute_forward_out_prod + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // block-tiling attempt + const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; + + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + +#if GGML_VEC_MAD_UNROLL > 2 + const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + 
i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#else + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#endif + } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst dim0 cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float 
*) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + dequantize_row_q(s0, wdata, ne0); + ggml_vec_mad_f32(ne0, d, wdata, *s1); + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_scale + +static void ggml_compute_forward_scale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scale factor + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + for (int i1 = ir0; i1 < ir1; i1++) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + } +} + +static void ggml_compute_forward_scale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_scale_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_set + +static void ggml_compute_forward_set_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + 
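+    // GGML_OP_SET: dst starts out as a copy of src0 (skipped when inplace),
+    // then the sub-view described by dst->op_params (strides nb1/nb2/nb3 plus
+    // a byte offset) is overwritten with the contents of src1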
GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset in bytes during set
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace && (params->type == GGML_TASK_INIT)) {
+        // memcpy needs to be synchronized across threads to avoid race conditions.
+        // => do it in INIT phase
+        memcpy(
+            ((char *)  dst->data),
+            ((char *) src0->data),
+            ggml_nbytes(dst));
+    }
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during set
+    const size_t nb0 = ggml_element_size(src0);
+
+    const int im0 = (ne10 == 0 ? 0 : ne10-1);
+    const int im1 = (ne11 == 0 ? 0 : ne11-1);
+    const int im2 = (ne12 == 0 ? 0 : ne12-1);
+    const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+    }
+}
+
+static void ggml_compute_forward_set(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_cpy
+
+static void ggml_compute_forward_cpy(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, src0, dst);
+}
+
+// ggml_compute_forward_cont
+
+static void ggml_compute_forward_cont(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, src0, dst);
+}
+
+// ggml_compute_forward_reshape
+
+static void ggml_compute_forward_reshape(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(src0);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_view
+
+static void ggml_compute_forward_view(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0) {
+    //
NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_permute + +static void ggml_compute_forward_permute( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_transpose + +static void ggml_compute_forward_transpose( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_get_rows + +static void ggml_compute_forward_get_rows_q( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == ggml_type_size(type)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + dequantize_row_q( + (const void *) ((char *) src0->data + r*src0->nb[1]), + (float *) ((char *) dst->data + i*dst->nb[1]), nc); + } +} + +static void ggml_compute_forward_get_rows_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i*dst->nb[1]), + (float *) ((char *) src0->data + r*src0->nb[1])); + } +} + +static void ggml_compute_forward_get_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_get_rows_q(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + } break; + default: + { 
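// get_rows is an embedding-style gather: for each int32 index in src1, one row
// of src0 is copied (f32), converted (f16), or dequantized (quantized types)
// into dst. Dense-float sketch with hypothetical names (not part of ggml):
//
// #include <stdint.h>
// static void get_rows_f32(const float * table, const int32_t * idx,
//                          float * out, int nr, int nc) {
//     for (int i = 0; i < nr; ++i) {
//         const int32_t r = idx[i];             // which row to fetch
//         for (int j = 0; j < nc; ++j) {
//             out[i*nc + j] = table[r*nc + j];  // copy one row of width nc
//         }
//     }
// }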
+ GGML_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_get_rows_back + +static void ggml_compute_forward_get_rows_back_f32_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_is_contiguous(dst)); + + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_is_contiguous(dst)); + + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) src0->data + i*src0->nb[1])); + } +} + +static void ggml_compute_forward_get_rows_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_diag + +static void ggml_compute_forward_diag_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct 
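// get_rows_back is the gradient of the gather above: it scatter-adds each
// incoming gradient row into the table row it came from, so repeated indices
// accumulate (hence the memset to zero in the INIT phase). Sketch with
// hypothetical names (not part of ggml):
//
// #include <stdint.h>
// static void get_rows_back_f32(const float * dout, const int32_t * idx,
//                               float * dtable /* zeroed */, int nr, int nc) {
//     for (int i = 0; i < nr; ++i) {
//         const int32_t r = idx[i];
//         for (int j = 0; j < nc; ++j) {
//             dtable[r*nc + j] += dout[i*nc + j];   // += rather than =
//         }
//     }
// }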
ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne00 == ne1); + GGML_ASSERT(ne01 == 1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); + for (int i0 = 0; i0 < i1; i0++) { + d[i0] = 0; + } + d[i1] = s[i1]; + for (int i0 = i1+1; i0 < ne0; i0++) { + d[i0] = 0; + } + } + } + } +} + +static void ggml_compute_forward_diag( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_diag_mask_inf + +static void ggml_compute_forward_diag_mask_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const float value) { + + const int ith = params->ith; + const int nth = params->nth; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; + + GGML_ASSERT(n_past >= 0); + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = ith; j < nr; j += nth) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + } + } + } + } +} + +static void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_soft_max + +static void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, 
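// diag_mask_f32 above overwrites entry (row j, column i) whenever
// i > n_past + j, i.e. everything strictly above the diagonal shifted right by
// n_past. With value = -INFINITY this is the causal attention mask (a later
// soft_max turns those entries into zero probability). Sketch for one
// row-major nr x nc matrix (illustrative only):
//
// static void diag_mask(float * x, int nr, int nc, int n_past, float value) {
//     for (int j = 0; j < nr; ++j) {
//         for (int i = n_past + j + 1; i < nc; ++i) {
//             x[j*nc + i] = value;
//         }
//     }
// }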
dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(sp[i])); + } +#endif + + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, sp); + + ggml_float sum = 0.0; + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (sp[i] == -INFINITY) { + dp[i] = 0.0f; + } else { + // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + dp[i] = val; + } + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(nc, dp, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + 
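// The derivation above collapses the full softmax Jacobian into a single dot
// product: dx = y * (dy - dot(y, dy)). The four ggml_vec_* calls that follow
// implement exactly that; in plain scalar form (illustrative only):
//
// static void softmax_backward_row(const float * y, const float * dy,
//                                  float * dx, int n) {
//     float dot_y_dy = 0.0f;
//     for (int i = 0; i < n; ++i) dot_y_dy += y[i]*dy[i];
//     for (int i = 0; i < n; ++i) dx[i] = y[i]*(dy[i] - dot_y_dy);
// }
//
// This is O(n) time and O(1) extra memory, instead of the O(n^2) cost of
// materializing J = diag(y) - y y^T.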
float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_alibi + +static void ggml_compute_forward_alibi_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + assert(n_past >= 0); + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + pdst[0] = i * m_k + src[0]; + + } + } + } +} + +static void ggml_compute_forward_alibi_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); 
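// The ALiBi slope for head k, as computed above and in the f32 variant: heads
// up to the largest power of two <= n_head follow one geometric series, the
// remainder interleave a second, steeper one. Standalone sketch (hypothetical
// helper, not part of ggml):
//
// #include <math.h>
// static float alibi_slope(int k, int n_head, float max_bias) {
//     const int n = 1 << (int) floor(log2(n_head));    // n_heads_log2_floor
//     const float m0 = powf(2.0f, -max_bias/n);
//     const float m1 = powf(2.0f, -(max_bias/2.0f)/n);
//     return k < n ? powf(m0, k + 1) : powf(m1, 2*(k - n) + 1);
// }
//
// The bias added at column i of head k is then simply i * alibi_slope(...),
// matching the pdst[0] = i * m_k + ... updates in both variants.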
+ + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + // we return F32 + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); + } + } + } +} + +static void ggml_compute_forward_alibi( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_alibi_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_alibi_f32(params, src0, dst); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_clamp + +static void ggml_compute_forward_clamp_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + + for (int i = 0; i < nc; i++) { + dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); + } + } +} + +static void ggml_compute_forward_clamp( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_clamp_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_rope + +static void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float freq_base; + float freq_scale; + + 
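// clamp reads its [min, max] pair from op_params and applies elementwise
// saturation, out = MAX(MIN(x, max), min). One-row sketch (illustrative only):
//
// static void clamp_row(float * x, int n, float lo, float hi) {
//     for (int i = 0; i < n; ++i) {
//         x[i] = x[i] < lo ? lo : (x[i] > hi ? hi : x[i]);
//     }
// }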
// these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb00 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (is_glm) { + theta = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta); + + theta *= theta_scale; + block_theta *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + const float x2 = src[n_dims]; + const float x3 = src[n_dims/2*3]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; + dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
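// Standard (non-neox) RoPE, as in the branch above: consecutive pairs
// (x[i0], x[i0+1]) are rotated by an angle that starts at freq_scale * p for
// position p and decays geometrically across the dimension. One-row sketch
// without the xPos extension (illustrative only, not part of ggml):
//
// #include <math.h>
// static void rope_row(float * x, int n_dims, int p,
//                      float freq_base /* e.g. 10000.0f */, float freq_scale) {
//     const float theta_scale = powf(freq_base, -2.0f/n_dims);
//     float theta = freq_scale * (float) p;
//     for (int i0 = 0; i0 < n_dims; i0 += 2) {
//         const float c = cosf(theta), s = sinf(theta);
//         const float x0 = x[i0], x1 = x[i0 + 1];
//         x[i0]     = x0*c - x1*s;
//         x[i0 + 1] = x0*s + x1*c;
//         theta *= theta_scale;   // the next pair rotates by a smaller angle
//     }
// }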
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float freq_base; + float freq_scale; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (is_glm) { + theta = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta); + + theta *= theta_scale; + block_theta *= theta_scale; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t 
*)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x2 = GGML_FP16_TO_FP32(src[n_dims]); + const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); + dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_rope_back + +static void ggml_compute_forward_rope_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) + // src0 is dy, src1 contains options + + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, 
sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta *= theta_scale; + + const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[1]; + + dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; + dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[n_dims/2]; + + dx[0] = dy0*cos_theta + dy1*sin_theta; + dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) + // src0 is dy, src1 contains options + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < 
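// rope_back undoes the forward rotation: each pair is multiplied by the
// transpose R(theta)^T = R(-theta) of the forward rotation matrix, which is
// why the sine terms in the dx updates change sign relative to the forward
// pass. For a single pair (illustrative only, not part of ggml):
//
// #include <math.h>
// static void rope_back_pair(float dy0, float dy1, float theta,
//                            float * dx0, float * dx1) {
//     const float c = cosf(theta), s = sinf(theta);
//     *dx0 =  dy0*c + dy1*s;
//     *dx1 = -dy0*s + dy1*c;
// }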
ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = (float)p; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[1]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + + // size of the convolution row - the kernel size unrolled across all input channels + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; + + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst 
+ const int nr = ne2; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f16(ew0, dst_data + i0, + (ggml_fp16_t *) ((char *) src0->data + i1*nb02), + (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +static void ggml_compute_forward_conv_1d_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; + + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 +static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, + ggml_fp16_t * A, + ggml_fp16_t * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB + // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); + // if (blck_size > 0) { + // blck_0 = 
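// conv_1d above is im2col followed by dot products: output position i0 gathers
// kernel tap ik from input index i0*s0 + ik*d0 - p0 (stride s0, dilation d0,
// padding p0), and taps that fall outside the signal stay zero, so the output
// length is OL = (IL + 2*p0 - d0*(nk - 1) - 1)/s0 + 1. Minimal single-channel
// sketch (hypothetical helper, not part of ggml):
//
// static void im2col_1d(const float * x, int IL, int K,
//                       int s0, int p0, int d0,
//                       int OL, float * cols /* [OL*K], zeroed */) {
//     for (int i0 = 0; i0 < OL; ++i0) {
//         for (int ik = 0; ik < K; ++ik) {
//             const int idx = i0*s0 + ik*d0 - p0;
//             if (idx >= 0 && idx < IL) {
//                 cols[i0*K + ik] = x[idx];
//             }
//         }
//     }
// }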
4; + // blck_1 = blck_size / blck_0; + // if (blck_1 < 0) { + // blck_1 = 1; + // } + // // blck_0 = (int64_t)sqrt(blck_size); + // // blck_1 = blck_0; + // } + // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } +} + +// src0: kernel [OC, IC, K] +// src1: signal [N, IC, IL] +// dst: result [N, OL, IC*K] +static void ggml_compute_forward_conv_1d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; + + const int64_t K = ne00; + + const int64_t OL = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; + + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); + } + } + } + } + } + } +} + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// src0: [OC, IC, K] +// src1: [N, OL, IC * K] +// result: [N, OC, OL] +static void ggml_compute_forward_conv_1d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne12; + const int OL = ne11; + + const int OC = ne02; + const int IC = ne01; + const int K = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OL; + int64_t k = IC * K; + + // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float 
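// gemm_f16_out_f32 above computes C[m,n] = A[m,k] * B[n,k]^T (both inputs
// row-major with rows of length k), tiling the output in 16x16 blocks so the
// f16 rows of A and B stay hot in cache across the inner dot products. The
// same loop nest in plain float, with the thread partition omitted
// (illustrative only):
//
// static void gemm_blocked(int m, int n, int k,
//                          const float * A, const float * B, float * C) {
//     const int bm = 16, bn = 16;
//     for (int j = 0; j < n; j += bn)
//     for (int i = 0; i < m; i += bm)
//     for (int ii = i; ii < i + bm && ii < m; ++ii)
//     for (int jj = j; jj < j + bn && jj < n; ++jj) {
//         float acc = 0.0f;
//         for (int kk = 0; kk < k; ++kk) acc += A[ii*k + kk]*B[jj*k + kk];
//         C[ii*n + jj] = acc;              // one dot product per output cell
//     }
// }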
* C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_transpose_1d + +static void ggml_compute_forward_conv_transpose_1d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } + + // permute source data (src1) from (L x Cin) to (Cin x L) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + ggml_fp16_t * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + ggml_fp16_t * wdata_kernel = wdata + 
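// conv_transpose_1d scatters rather than gathers: input position i contributes
// to the s0-strided window of outputs [i*s0, i*s0 + K), which is why the inner
// loop accumulates with += into dst. Single-channel sketch; the output has
// length (IL - 1)*s0 + K (illustrative only, not part of ggml):
//
// static void conv_transpose_1d(const float * x, int IL,
//                               const float * w, int K, int s0,
//                               float * dst /* [(IL-1)*s0 + K], zeroed */) {
//     for (int i = 0; i < IL; ++i) {
//         for (int k = 0; k < K; ++k) {
//             dst[i*s0 + k] += x[i]*w[k];
//         }
//     }
// }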
i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne02, &v, + (ggml_fp16_t *) wdata_src + i1n, + (ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_compute_forward_conv_transpose_1d_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_compute_forward_conv_transpose_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_2d + +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_conv_2d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( 
dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void ggml_compute_forward_conv_2d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_2d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + 
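// conv_2d_stage_0 above is the 2-D im2col: output pixel (oh, ow) of input
// channel ic copies its KH x KW receptive field, sampled at
// ih = oh*s1 + kh*d1 - p1 and iw = ow*s0 + kw*d0 - p0, into one row of length
// IC*KH*KW; stage_1 then reduces the convolution to a single
// [OC, IC*KH*KW] x [OH*OW, IC*KH*KW]^T matrix product. Single-channel sketch
// (hypothetical helper, not part of ggml):
//
// static void im2col_2d(const float * img, int IH, int IW,
//                       int KH, int KW, int s0, int s1, int p0, int p1,
//                       int d0, int d1, int OH, int OW,
//                       float * cols /* [OH*OW*KH*KW], zeroed */) {
//     for (int oh = 0; oh < OH; ++oh)
//     for (int ow = 0; ow < OW; ++ow)
//     for (int kh = 0; kh < KH; ++kh)
//     for (int kw = 0; kw < KW; ++kw) {
//         const int ih = oh*s1 + kh*d1 - p1;
//         const int iw = ow*s0 + kw*d0 - p0;
//         if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
//             cols[((oh*OW + ow)*KH + kh)*KW + kw] = img[ih*IW + iw];
//         }
//     }
// }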
struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; + + const int ith = params->ith; + const int nth = params->nth; + + // const int nk0 = ne00; + // const int nk1 = ne01; + + // size of the convolution row - the kernel size unrolled across all channels + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_0( + const struct ggml_compute_params * params, + const 
struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_transpose_2d + +static void ggml_compute_forward_conv_transpose_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02*ne03; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } + + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int32_t stride = ggml_get_op_params_i32(dst, 0); + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne03, &v, + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); + dst_data[(i11*stride + i01)*ne0 + i10*stride + 
i00] += v; + } + } + } + } + } +} + +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + + cdata += src->nb[2]; + 
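// with k0 == s0 and k1 == s1 (the only case supported here), each output plane + // is a clean downsample: e.g. 2x2 pooling of a 64x64 plane gives px = py = 32 + // and pa = 1024, so dst advances one 32x32 plane per 64x64 input plane + 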
dplane += pa; + } +} + +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); +} + +// ggml_compute_forward_upscale + +static void ggml_compute_forward_upscale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const int scale_factor = dst->op_params[0]; + + // TODO: optimize + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = ith; i02 < ne02; i02 += nth) { // each thread handles every nth channel plane + for (int m = 0; m < dst->ne[1]; m++) { + int i01 = m / scale_factor; + for (int n = 0; n < dst->ne[0]; n++) { + int i00 = n / scale_factor; + + const float * x = (float *)((char *) src0->data + i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_upscale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_upscale_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn + +static void ggml_compute_forward_flash_attn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // 
total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(masked_begin, S, scale); + + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SW values to zero + { + float max = -INFINITY; + ggml_vec_max_f32(masked_begin, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SS = S + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(masked_begin, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < masked_begin; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f32(masked_begin, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S); + } + } +} + +static void ggml_compute_forward_flash_attn_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, 
dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f16(neq0, + S + i1, + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } else { + for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2 % nek2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f16_unroll(neq0, nbk1, + S + i1, + ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. 
+ // dont forget to set their S values to zero + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = S + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < M; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). + if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S16); + } + } else { + for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16_unroll(nev0, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + S16); + } + } + } +} + +static void ggml_compute_forward_flash_attn( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_ff + +static void ggml_compute_forward_flash_ff_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, // F16 + const struct ggml_tensor * b0, // F16 fc_w + const struct ggml_tensor * b1, // F32 fc_b + const struct ggml_tensor * c0, // F16 proj_w + const struct ggml_tensor * c1, // F32 proj_b + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, nea, a, ne) + GGML_TENSOR_LOCALS(size_t, nba, a, nb) + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) + GGML_TENSOR_LOCALS(int64_t, 
nec1, c1, ne) + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = nea0; + //const int64_t N = nea1; + const int64_t M = neb01; + + GGML_ASSERT(ne0 == nea0); + GGML_ASSERT(ne1 == nea1); + GGML_ASSERT(ne2 == nea2); + + GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb10 == sizeof(float)); + GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbc10 == sizeof(float)); + + GGML_ASSERT(neb00 == D); + GGML_ASSERT(neb01 == M); + GGML_ASSERT(neb10 == M); + GGML_ASSERT(neb11 == 1); + + GGML_ASSERT(nec00 == M); + GGML_ASSERT(nec01 == D); + GGML_ASSERT(nec10 == D); + GGML_ASSERT(nec11 == 1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by a rows using ggml_vec_dot_f32 + + // total rows in a + const int nr = nea1*nea2*nea3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // a indices + const int ia3 = ir/(nea2*nea1); + const int ia2 = (ir - ia3*nea2*nea1)/nea1; + const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); + + float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); + + for (int64_t ic = 0; ic < neb01; ++ic) { + // b0 indices + const int ib03 = ia3; + const int ib02 = ia2; + const int ib01 = ic; + + // S indices + const int i1 = ib01; + + ggml_vec_dot_f16(nea0, + S + i1, + (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + } + + ggml_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_vec_gelu_f32(neb01, S, S); + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + ggml_vec_gelu_f16(neb01, S16, S16); + + { + // dst indices + const int i1 = ia1; + const int i2 = ia2; + const int i3 = ia3; + + for (int64_t ic = 0; ic < nec01; ++ic) { + + ggml_vec_dot_f16(neb01, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + S16); + } + + ggml_vec_add_f32(nec01, + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) c1->data); + } + } +} + +static void ggml_compute_forward_flash_ff( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b0, + const struct ggml_tensor * b1, + const struct ggml_tensor * c0, + const struct ggml_tensor * c1, + struct ggml_tensor * dst) { + switch (b0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); // TODO + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const 
struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ned, d, ne) + GGML_TENSOR_LOCALS(size_t, nbd, d, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + + enum ggml_type result_type = dst->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using ggml_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + + for (int ir = ir0; ir < ir1; ++ir) { + // k indices + const int ik3 = ir/(nek2); + const int ik2 = ir - ik3*nek2; + + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; + + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; + + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
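+ // per-thread scratch layout in wdata (a sketch, assuming wsize was sized as + // nth*2*(mxDM + CACHE_LINE_SIZE_F32) floats): each thread gets two regions, + // S (scores/grads) and SM (softmax result), each mxDM floats wide followed by + // CACHE_LINE_SIZE_F32 floats of padding so threads stay on separate cache lines: + // + // [ S(0) | pad | SM(0) | pad | S(1) | pad | SM(1) | pad | ... ]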
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(masked_begin, S, scale); + + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SM values to zero + { + float max = -INFINITY; + ggml_vec_max_f32(masked_begin, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; + SW[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(masked_begin, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = 
d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } + + // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + // exclude known future zero S[..] values from operation + ggml_vec_set_f32(masked_begin, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (masked_begin, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + // already done by above ggml_vec_set_f32 + + // exclude known zero S[..] values from operation + ggml_vec_scale_f32(masked_begin, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } + + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // exclude known zero SM[..] 
values from mad + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + } + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_win_part + +static void ggml_compute_forward_win_part_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; + + assert(ne00 == ne0); + assert(ne3 == nep0*nep1); + + // TODO: optimize / multi-thread + for (int py = 0; py < nep1; ++py) { + for (int px = 0; px < nep0; ++px) { + const int64_t i3 = py*nep0 + px; + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i02 = py*w + i2; + const int64_t i01 = px*w + i1; + const int64_t i00 = i0; + + const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; + const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; + + if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { + ((float *) dst->data)[i] = 0.0f; + } else { + ((float *) dst->data)[i] = ((float *) src0->data)[j]; + } + } + } + } + } + } +} + +static void ggml_compute_forward_win_part( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_part_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_win_unpart + +static void ggml_compute_forward_win_unpart_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t w = ((const int32_t *)(dst->op_params))[0]; + + // padding + const int px = (w - ne1%w)%w; + //const int py = (w - ne2%w)%w; + + const int npx = (px + ne1)/w; + //const int npy = (py + ne2)/w; + + assert(ne0 == ne00); + + // TODO: optimize / multi-thread + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int ip2 = i2/w; + const int ip1 = i1/w; + + const int64_t i02 = i2%w; + const int64_t i01 = i1%w; + const int64_t i00 = i0; + + const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; + const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; + + ((float *) dst->data)[j] = ((float *) src0->data)[i]; + } + } + } +} + +static void ggml_compute_forward_win_unpart( + const struct ggml_compute_params * params, + const 
struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_unpart_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_unary + +static void ggml_compute_forward_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); + + switch (op) { + case GGML_UNARY_OP_ABS: + { + ggml_compute_forward_abs(params, src0, dst); + } break; + case GGML_UNARY_OP_SGN: + { + ggml_compute_forward_sgn(params, src0, dst); + } break; + case GGML_UNARY_OP_NEG: + { + ggml_compute_forward_neg(params, src0, dst); + } break; + case GGML_UNARY_OP_STEP: + { + ggml_compute_forward_step(params, src0, dst); + } break; + case GGML_UNARY_OP_TANH: + { + ggml_compute_forward_tanh(params, src0, dst); + } break; + case GGML_UNARY_OP_ELU: + { + ggml_compute_forward_elu(params, src0, dst); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_compute_forward_relu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU: + { + ggml_compute_forward_gelu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + ggml_compute_forward_gelu_quick(params, src0, dst); + } break; + case GGML_UNARY_OP_SILU: + { + ggml_compute_forward_silu(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_get_rel_pos + +static void ggml_compute_forward_get_rel_pos_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 + + GGML_TENSOR_UNARY_OP_LOCALS + + const int64_t w = ne1; + + ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } + } +} + +static void ggml_compute_forward_get_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add_rel_pos + +static void ggml_compute_forward_add_rel_pos_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace && params->type == GGML_TASK_INIT) { + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + return; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 + + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + + const int64_t ne10 = src1->ne[0]; + const int64_t 
ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} + +static void ggml_compute_forward_add_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_unary + +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_binary + +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +static void ggml_compute_forward_map_binary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + 
ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b, c); +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + + if (params->type == 
GGML_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; + } + return; + } + + const double eps = 1e-9; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = ((float *) params->wdata) + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + // sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = (1.0 - eps) / sum; + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); + + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const double eps = 1e-9; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + +#ifndef 
NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + ds0[i] = 0.0f; + } else { +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; + ds0[i] = val; + } + } + + assert(sum > 0.0); + sum = (1.0 - eps)/sum; + } + + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +///////////////////////////////// + +static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + GGML_ASSERT(params); + +#ifdef GGML_USE_CUBLAS + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return; + } + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); +#endif // GGML_USE_CUBLAS + + switch (tensor->op) { + case GGML_OP_DUP: + { + ggml_compute_forward_dup(params, tensor->src[0], tensor); + } break; + case GGML_OP_ADD: + { + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ADD1: + { + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ACC: + { + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SUB: + { + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_MUL: + { + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_DIV: + { + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SQR: + { + ggml_compute_forward_sqr(params, tensor->src[0], tensor); + } break; + case GGML_OP_SQRT: + { + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + } break; + case GGML_OP_LOG: + { + ggml_compute_forward_log(params, tensor->src[0], tensor); + } break; + case GGML_OP_SUM: + { + ggml_compute_forward_sum(params, tensor->src[0], tensor); + } break; + case GGML_OP_SUM_ROWS: + { + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + } break; + case GGML_OP_MEAN: + { + ggml_compute_forward_mean(params, tensor->src[0], tensor); + } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor->src[0], tensor); + } break; + case GGML_OP_REPEAT: + { + 
ggml_compute_forward_repeat(params, tensor->src[0], tensor); + } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONCAT: + { + ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SILU_BACK: + { + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_NORM: + { + ggml_compute_forward_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_RMS_NORM: + { + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_RMS_NORM_BACK: + { + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_GROUP_NORM: + { + ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_MUL_MAT: + { + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SCALE: + { + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SET: + { + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CPY: + { + ggml_compute_forward_cpy(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor->src[0], tensor); + } break; + case GGML_OP_RESHAPE: + { + ggml_compute_forward_reshape(params, tensor->src[0], tensor); + } break; + case GGML_OP_VIEW: + { + ggml_compute_forward_view(params, tensor->src[0]); + } break; + case GGML_OP_PERMUTE: + { + ggml_compute_forward_permute(params, tensor->src[0]); + } break; + case GGML_OP_TRANSPOSE: + { + ggml_compute_forward_transpose(params, tensor->src[0]); + } break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_GET_ROWS_BACK: + { + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_DIAG: + { + ggml_compute_forward_diag(params, tensor->src[0], tensor); + } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + } break; + case GGML_OP_SOFT_MAX: + { + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); + } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ROPE: + { + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ROPE_BACK: + { + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ALIBI: + { + ggml_compute_forward_alibi(params, tensor->src[0], tensor); + } break; + case GGML_OP_CLAMP: + { + ggml_compute_forward_clamp(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONV_1D: + { + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + 
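// the STAGE_0/STAGE_1 conv variants split the convolution into a separate + // im2col op and gemm op (see ggml_compute_forward_conv_2d_stage_0/1 above), + // while the plain CONV ops below do both phases inside a single node + 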
case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D: + { + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_0: + { + ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_POOL_1D: + { + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + } break; + case GGML_OP_POOL_2D: + { + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + } break; + case GGML_OP_UPSCALE: + { + ggml_compute_forward_upscale(params, tensor->src[0], tensor); + } break; + case GGML_OP_FLASH_ATTN: + { + const int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + const bool masked = t != 0; + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + } break; + case GGML_OP_FLASH_FF: + { + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + } break; + case GGML_OP_WIN_PART: + { + ggml_compute_forward_win_part(params, tensor->src[0], tensor); + } break; + case GGML_OP_WIN_UNPART: + { + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + } break; + case GGML_OP_UNARY: + { + ggml_compute_forward_unary(params, tensor->src[0], tensor); + } break; + case GGML_OP_GET_REL_POS: + { + ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + } break; + case GGML_OP_ADD_REL_POS: + { + ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; + case GGML_OP_MAP_UNARY: + { + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_BINARY: + { + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1_F32: + { + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: + { + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3_F32: + { + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + } + break; 
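// ---- editor's sketch ----------------------------------------------------
// The GGML_OP_MAP_* cases above recover a user-supplied callback that was
// stored byte-wise in the fixed-size op_params buffer when the map op was
// created: the builder memcpys the function pointer in, and the dispatcher
// memcpys it back out before calling it. A self-contained sketch of that
// round-trip -- the names and the 64-byte buffer are hypothetical stand-ins;
// the real buffer is tensor->op_params of GGML_MAX_OP_PARAMS bytes:

#include <string.h>

typedef float (*example_unary_f32_t)(float);

static float example_square(float x) { return x * x; }

static float example_op_params_roundtrip(void) {
    char op_params[64] = {0};                // stands in for tensor->op_params
    example_unary_f32_t fin = example_square;
    memcpy(op_params, &fin, sizeof(fin));    // stored at op-creation time
    example_unary_f32_t fout = NULL;
    memcpy(&fout, op_params, sizeof(fout));  // recovered at compute time
    return fout(3.0f);                       // == 9.0f
}
// -------------------------------------------------------------------------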
+ case GGML_OP_MAP_CUSTOM3: + { + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; + } + } + return i; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + + if (hash_table[i] == p) { + return true; + } + + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} + +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; + +static struct hash_map * new_hash_map(void) { + struct hash_map * result = malloc(sizeof(struct hash_map)); + for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { + result->keys[i] = NULL; + result->vals[i] = NULL; + } + return result; +} + +static void free_hash_map(struct hash_map * map) { + free(map); +} + +// gradient checkpointing + +static struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (node->view_src != NULL) { + clone->data = (node->view_src->data == NULL) + ? 
NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); + + return clone; +} + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i < gb_tmp->n_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. 
terminating when) input tensors are replacements (like checkpoints) + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } + + free_hash_map(replacements); +} + +// functions to change gradients considering the case that input a might be the initial gradient with zero value +// (a scalar sketch of this bookkeeping follows the GGML_OP_SUB case below) + +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return b; + } else { + return ggml_add_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0)); + return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); + } else { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + } +} + +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_repeat(ctx, b, a); + } else { + return ggml_add1_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_neg(ctx, b); + } else { + return ggml_sub_impl(ctx, a, b, false); + } +} + +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) { + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + + switch (tensor->op) { + case GGML_OP_DUP: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_OP_ADD: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table); + } + } break; + case GGML_OP_ADD1: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_add_or_set(ctx, + src1->grad, + ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + zero_table); + } + } break; + case GGML_OP_ACC: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + + src1->grad = + ggml_add_or_set(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + zero_table); + } + } break; + case GGML_OP_SUB: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); + } + } break; 
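// ---- editor's sketch ----------------------------------------------------
// Rationale for the *_or_set helpers defined above: every gradient tensor
// starts life as a zero-valued placeholder, and those placeholders are
// recorded in zero_table by ggml_build_backward_expand further below. For
// the first contribution to a gradient, a + b with a == 0 is just b, so the
// helper returns b directly instead of emitting an add node. A scalar
// analogue of this bookkeeping (hypothetical names, illustrative only):

#include <stdbool.h>

typedef struct { float value; bool is_zero; } example_grad;

static example_grad example_add_or_set(example_grad a, float b) {
    if (a.is_zero) {
        return (example_grad) { b, false };        // first write: just set
    }
    return (example_grad) { a.value + b, false };  // later writes: accumulate
}
// -------------------------------------------------------------------------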
+ case GGML_OP_MUL: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_mul(ctx, src1, tensor->grad), + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_add_or_set(ctx, + src1->grad, + ggml_mul(ctx, src0, tensor->grad), + zero_table); + } + } break; + case GGML_OP_DIV: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_div(ctx, tensor->grad, src1), + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_sub_or_set(ctx, + src1->grad, + ggml_mul(ctx, + tensor->grad, + ggml_div(ctx, tensor, src1)), + zero_table); + } + } break; + case GGML_OP_SQR: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_scale(ctx, + ggml_mul(ctx, src0, tensor->grad), + ggml_new_f32(ctx, 2.0f)), + zero_table); + } + } break; + case GGML_OP_SQRT: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_scale(ctx, + ggml_div(ctx, + tensor->grad, + tensor), + ggml_new_f32(ctx, 0.5f)), + zero_table); + } + } break; + case GGML_OP_LOG: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_div(ctx, + tensor->grad, + src0), + zero_table); + } + } break; + case GGML_OP_SUM: + { + if (src0->grad) { + src0->grad = + ggml_add1_or_set(ctx, + src0->grad, + tensor->grad, + zero_table); + } + } break; + case GGML_OP_SUM_ROWS: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_repeat(ctx, + tensor->grad, + src0->grad), + zero_table); + } + } break; + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + { + GGML_ASSERT(false); // TODO: implement + } break; + case GGML_OP_REPEAT: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_repeat_back(ctx, tensor->grad, src0->grad), + zero_table); + } + } break; + case GGML_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + zero_table); + } + } break; + case GGML_OP_CONCAT: + { + GGML_ASSERT(false); // TODO: implement + } break; + case GGML_OP_SILU_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_RMS_NORM: + { + // necessary for llama + if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), + zero_table); + } + } break; + case GGML_OP_RMS_NORM_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_MUL_MAT: + { + // https://cs231n.github.io/optimization-2/#staged + // # forward pass + // s0 = np.random.randn(5, 10) + // s1 = np.random.randn(10, 3) + // t = s0.dot(s1) + + // # now suppose we had the gradient on t from above in the circuit + // dt = np.random.randn(*t.shape) # same shape as t + // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix + // ds1 = s0.T.dot(dt) + + // tensor.shape [m,p,qq,rr] + // src0.shape [n,m,q1,r1] + // src1.shape [n,p,qq,rr] + + // necessary for llama + if (src0->grad) { + struct ggml_tensor * s1_tg = + ggml_out_prod(ctx, // [n,m,qq,rr] + src1, // [n,p,qq,rr] + tensor->grad); // [m,p,qq,rr] + const int64_t qq = s1_tg->ne[2]; + const int64_t rr = s1_tg->ne[3]; + const int64_t q1 = src0->ne[2]; + const int64_t r1 = src0->ne[3]; + const bool ne2_broadcasted = 
qq > q1; + const bool ne3_broadcasted = rr > r1; + if (ne2_broadcasted || ne3_broadcasted) { + // sum broadcast repetitions of s1_tg into shape of src0 + s1_tg = ggml_repeat_back(ctx, s1_tg, src0); + } + src0->grad = + ggml_add_or_set(ctx, + src0->grad, // [n,m,q1,r1] + s1_tg, // [n,m,q1,r1] + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_add_or_set(ctx, + src1->grad, // [n,p,qq,rr] + // ggml_mul_mat(ctx, // [n,p,qq,rr] + // ggml_cont(ctx, // [m,n,q1,r1] + // ggml_transpose(ctx, src0)), // [m,n,q1,r1] + // tensor->grad), // [m,p,qq,rr] + + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_out_prod + ggml_out_prod(ctx, // [n,p,qq,rr] + src0, // [n,m,q1,r1] + ggml_transpose(ctx, // [p,m,qq,rr] + tensor->grad)), // [m,p,qq,rr] + zero_table); + } + } break; + case GGML_OP_OUT_PROD: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_SCALE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_scale_impl(ctx, tensor->grad, src1, false), + zero_table); + } + if (src1->grad) { + src1->grad = + ggml_add_or_set(ctx, + src1->grad, + ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + zero_table); + } + } break; + case GGML_OP_SET: + { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_tensor * tensor_grad_view = NULL; + + if (src0->grad || src1->grad) { + GGML_ASSERT(src0->type == tensor->type); + GGML_ASSERT(tensor->grad->type == tensor->type); + GGML_ASSERT(tensor->grad->type == src1->grad->type); + + tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + } + + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_acc_impl(ctx, + tensor->grad, + ggml_neg(ctx, tensor_grad_view), + nb1, nb2, nb3, offset, false), + zero_table); + } + + if (src1->grad) { + src1->grad = + ggml_add_or_set(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + zero_table); + } + } break; + case GGML_OP_CPY: + { + // necessary for llama + // cpy overwrites value of src1 by src0 and returns view(src1) + // the overwriting is mathematically equivalent to: + // tensor = src0 * 1 + src1 * 0 + if (src0->grad) { + // dsrc0 = dtensor * 1 + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + if (src1->grad) { + // dsrc1 = dtensor * 0 -> noop + } + } break; + case GGML_OP_CONT: + { + // same as cpy + if (src0->grad) { + GGML_ASSERT(ggml_is_contiguous(src0->grad)); + GGML_ASSERT(ggml_is_contiguous(tensor->grad)); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_OP_RESHAPE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_reshape(ctx, + ggml_is_contiguous(tensor->grad) + ? 
tensor->grad + : ggml_cont(ctx, tensor->grad), + src0->grad), + zero_table); + } + } break; + case GGML_OP_VIEW: + { + // necessary for llama + if (src0->grad) { + size_t offset; + + memcpy(&offset, tensor->op_params, sizeof(offset)); + + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; + + if (src0->type != src0->grad->type) { + // gradient is typically F32, but src0 could be other type + size_t ng = ggml_element_size(src0->grad); + size_t n0 = ggml_element_size(src0); + GGML_ASSERT(offset % n0 == 0); + GGML_ASSERT(nb1 % n0 == 0); + GGML_ASSERT(nb2 % n0 == 0); + GGML_ASSERT(nb3 % n0 == 0); + offset = (offset / n0) * ng; + nb1 = (nb1 / n0) * ng; + nb2 = (nb2 / n0) * ng; + nb3 = (nb3 / n0) * ng; + } + + src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); + } + } break; + case GGML_OP_PERMUTE: + { + // necessary for llama + if (src0->grad) { + int32_t * axes = (int32_t *) tensor->op_params; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; + int axes_backward[4] = {0,0,0,0}; + axes_backward[axis0] = 0; + axes_backward[axis1] = 1; + axes_backward[axis2] = 2; + axes_backward[axis3] = 3; + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_permute(ctx, + tensor->grad, + axes_backward[0], + axes_backward[1], + axes_backward[2], + axes_backward[3]), + zero_table); + } + } break; + case GGML_OP_TRANSPOSE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_transpose(ctx, tensor->grad), + zero_table); + } + } break; + case GGML_OP_GET_ROWS: + { + // necessary for llama (only for tokenizer) + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, src0->grad, + // last ggml_get_rows_back argument src0->grad is only + // necessary to setup correct output shape + ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), + zero_table); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_GET_ROWS_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG_MASK_INF: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + zero_table); + } + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + zero_table); + } + } break; + case GGML_OP_SOFT_MAX: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), + zero_table); + } + + } break; + case GGML_OP_SOFT_MAX_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ROPE: + { + // necessary for llama + if (src0->grad) { + //const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); 
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_rope_back(ctx, + tensor->grad, + src1, + n_dims, + mode, + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down), + zero_table); + } + } break; + case GGML_OP_ROPE_BACK: + { + if (src0->grad) { + //const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_rope_impl(ctx, + tensor->grad, + src1, + n_dims, + mode, + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down, + false), + zero_table); + } + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CLAMP: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_FLASH_ATTN: + { + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_flash_attn_back(ctx, + src0, + src1, + tensor->src[2], + tensor->grad, + masked); + } + + struct ggml_tensor * src2 = tensor->src[2]; + const int64_t elem_q = ggml_nelements(src0); + const int64_t elem_k = ggml_nelements(src1); + const int64_t elem_v = ggml_nelements(src2); + + enum ggml_type result_type = flash_grad->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + if (src0->grad) { + struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q); + struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0); + src0->grad = ggml_add_or_set(ctx, + src0->grad, + grad_q, + zero_table); + } + if (src1->grad) { + struct ggml_tensor * view_k = 
ggml_view_1d(ctx, flash_grad, elem_k, offs_k); + struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1); + src1->grad = ggml_add_or_set(ctx, + src1->grad, + grad_k, + zero_table); + } + if (src2->grad) { + struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v); + struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2); + src2->grad = ggml_add_or_set(ctx, + src2->grad, + grad_v, + zero_table); + } + } break; + case GGML_OP_FLASH_FF: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_UNARY: + { + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_ABS: + { + if (src0->grad) { + src0->grad = + ggml_add_or_set(ctx, + src0->grad, + ggml_mul(ctx, + ggml_sgn(ctx, src0), + tensor->grad), + zero_table); + } + } break; + case GGML_UNARY_OP_SGN: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_NEG: + { + if (src0->grad) { + src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); + } + } break; + case GGML_UNARY_OP_STEP: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_RELU: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_mul(ctx, + ggml_step(ctx, src0), + tensor->grad), + zero_table); + } + } break; + case GGML_UNARY_OP_GELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_SILU: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_silu_back(ctx, src0, tensor->grad), + zero_table); + } + } break; + default: + GGML_ASSERT(false); + } + } break; + case GGML_OP_GET_REL_POS: + case GGML_OP_ADD_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + zero_table); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (tensor->src[i] && tensor->src[i]->grad) { + GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad)); + } + } +} + +static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { + if (node->grad == NULL) { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if (node->op != GGML_OP_NONE) { + //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + } + } + + // check if already visited + if (hash_insert(cgraph->visited_hash_table, node)) { + return; + } + + node->n_dst = 0; + + for (int i = 0; i < GGML_MAX_SRC; 
++i) { + const int k = + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) : + /* unknown order, just fall back to using i*/ i; + if (node->src[k]) { + ggml_visit_parents(cgraph, node->src[k]); + node->src[k]->n_dst++; + } + } + + if (node->op == GGML_OP_NONE && node->grad == NULL) { + // reached a leaf node, not part of the gradient graph (e.g. a constant) + GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); + } + + cgraph->leafs[cgraph->n_leafs] = node; + cgraph->n_leafs++; + } else { + GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + ggml_format_name(node, "node_%d", cgraph->n_nodes); + } + + cgraph->nodes[cgraph->n_nodes] = node; + cgraph->grads[cgraph->n_nodes] = node->grad; + cgraph->n_nodes++; + } +} + +static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { + if (!expand) { + cgraph->n_nodes = 0; + cgraph->n_leafs = 0; + } + + const int n0 = cgraph->n_nodes; + UNUSED(n0); + + ggml_visit_parents(cgraph, tensor); + + const int n_new = cgraph->n_nodes - n0; + GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + + if (n_new > 0) { + // the last added node should always be starting point + GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + } +} + +void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + ggml_build_forward_impl(cgraph, tensor, true); +} + +struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { + struct ggml_cgraph result = { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + ggml_build_forward_impl(&result, tensor, false); + + return result; +} + +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { + GGML_ASSERT(gf->n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if (keep) { + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->grad) { + node->grad = ggml_dup_tensor(ctx, node); + gf->grads[i] = node->grad; + } + } + } + + // remember original gradients which start with zero values + void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE); + memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE); + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->grads[i]) { + hash_insert(zero_table, gf->grads[i]); + } + } + + for (int i = gf->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = gf->nodes[i]; + + // inplace operations to add gradients are not created by ggml_compute_backward + // use allocator to automatically make inplace operations + if (node->grad) { + ggml_compute_backward(ctx, node, zero_table); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->is_param) { + GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_build_forward_expand(gb, node->grad); + } + } + + free(zero_table); +} + +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result 
= *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); + return result; +} + +struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE); + struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + + *cgraph = (struct ggml_cgraph) { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) { + struct ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_build_forward_impl(cgraph, tensor, false); + return cgraph; +} + +size_t ggml_graph_overhead(void) { + return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN); +} + +// +// thread data +// +// synchronization is done via busy loops +// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops +// + +#ifdef __APPLE__ + +//#include <os/lock.h> +// +//typedef os_unfair_lock ggml_lock_t; +// +//#define ggml_lock_init(x) UNUSED(x) +//#define ggml_lock_destroy(x) UNUSED(x) +//#define ggml_lock_lock os_unfair_lock_lock +//#define ggml_lock_unlock os_unfair_lock_unlock +// +//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__linux__) && !defined(__BIONIC__) +static void set_numa_thread_affinity(int thread_n, int n_threads) { + if (!ggml_is_numa()) { + return; + } + + // run thread on node_num thread_n / (threads per node) + const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i = 0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} + +static void clear_numa_thread_affinity(void) { + if 
(!ggml_is_numa()) { + return; + } + + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. +// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + + +static void on_node_compute_start(struct ggml_tensor* node) { + // printf("on_node_compute_start %s dynamic %d not_own_data %d hold %d\n", + // GGML_OP_NAME[node->op], node->dynamic, node->not_own_data, node->dynamic_hold); + if (node->dynamic) { + if (node->not_own_data) { + //GGML_ASSERT(node->data == NULL); + + if (node->op == GGML_OP_VIEW) { + size_t offset; + memcpy(&offset, node->op_params, sizeof(offset)); + node->data = (void*)((char*)node->src[0]->data + offset); + } + else { + node->data = node->src[0]->data; + } + + } + else { + if (node->data != NULL && node->dynamic_hold) { + return; + } + GGML_ASSERT(node->data == NULL); + node->data = GGML_DYNAMIC_MALLOC(ggml_nbytes(node)); + } + } + // printf("on_node_compute_start done %s dynamic %d not_own_data %d hold %d\n", + // GGML_OP_NAME[node->op], node->dynamic, node->not_own_data, node->dynamic_hold); +} + +static void on_node_all_dst_compute_done(struct ggml_tensor* node) { + if (node->not_own_data) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor* curr = node->src[i]; + if (curr) { + GGML_ASSERT(curr->n_dst_curr > 0); + curr->n_dst_curr--; + if (curr->n_dst_curr == 0) { + on_node_all_dst_compute_done(curr); + } + } + } + } + else { + if (node->dynamic && !node->dynamic_hold) { + GGML_DYNAMIC_FREE(node->data); + node->data = NULL; + } + } +} + +static void on_node_compute_done(struct ggml_tensor* node) { + // printf("on_node_compute_done %s dynamic %d not_own_data %d hold %d\n", + // GGML_OP_NAME[node->op], node->dynamic, node->not_own_data, node->dynamic_hold); + if (node->not_own_data) { + return; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor* curr = node->src[i]; + if (curr) { + GGML_ASSERT(curr->n_dst_curr > 0); + curr->n_dst_curr--; + if (curr->n_dst_curr == 0) { + on_node_all_dst_compute_done(curr); + } + } + } +} + +static thread_ret_t 
ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + + set_numa_thread_affinity(state->ith, n_threads); + + int node_n = -1; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have to synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = n_tasks_arr[node_n]; + ggml_compute_forward(&params, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); + + on_node_compute_done(node); + } + + // distribute new work or execute it directly if 1T + while (++node_n < cgraph->n_nodes) { + GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + params.nth = n_tasks; + + /* INIT */ + on_node_compute_start(node); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(&params, node); + } + + if (n_tasks == 1) { + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(&params, node); + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(&params, node); + } + + ggml_graph_compute_perf_stats_node(node, state->shared); + + on_node_compute_done(node); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + } else { + // wait for other threads to finish + const int last = node_n; + while (true) { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. 
+ // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + sched_yield(); +#endif + + node_n = atomic_load(&state->shared->node_n); + if (node_n != last) break; + } + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; + + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + ggml_compute_forward(&params, node); + } + } + + return GGML_EXIT_SUCCESS; +} + +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; + + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + { + n_tasks = 1; + } break; + + case GGML_OP_UNARY: + { + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + { + n_tasks = 1; + } break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + } + } break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONCAT: + case GGML_OP_MUL_MAT: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the 
threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src[0]->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + } + } else +#endif + if (node->src[1]->type != vec_dot_type) { + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; + const int64_t ne01 = node->src[0]->ne[1]; + const int64_t ne02 = node->src[0]->ne[2]; + + const int64_t ne10 = node->src[1]->ne[0]; + const int64_t ne11 = node->src[1]->ne[1]; + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t nk = ne00; + const int64_t ew0 = nk * ne01; + + UNUSED(ne02); + UNUSED(ne10); + UNUSED(ne11); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*(ne0*ne1*ew0); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + size_t cur = 0; + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += 
sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; + const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; + + UNUSED(ne03); + UNUSED(ne2); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In + + size_t cur = 0; + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + 
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + cgraph->nodes[i]->n_dst_curr = cgraph->nodes[i]->n_dst; + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } + } + + for (int i = 0; i < cgraph->n_leafs; ++i) { + cgraph->leafs[i]->n_dst_curr = cgraph->leafs[i]->n_dst; + } + } + + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; + + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } + + workers[0].ith = 0; + workers[0].shared 
= &state_shared; + + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); + + // this is a work thread too + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + } + } + + // performance stats (graph) + { + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; + + cgraph->perf_runs++; + cgraph->perf_cycles += perf_cycles_cur; + cgraph->perf_time_us += perf_time_us_cur; + + GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + __func__, cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_time_us_cur / 1000.0, + (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); + } + + return compute_status; +} + +void ggml_graph_reset(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_set_zero(grad); + } + } +} + +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + ggml_graph_compute(cgraph, &cplan); +} + +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! 
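+ // NOTE: size_eval is the padded total of the node tensors only - ggml_graph_import + // below sizes its no_alloc eval context from this value and materializes the + // nodes there, while leaf data keeps pointing into the raw file buffer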
+ for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes_pad(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + 
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); + + // output the op arguments + { + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; + } + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + .dynamic = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + fclose(fin); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + { + const size_t ret = fread(data->data, sizeof(char), fsize, fin); + if (ret != fsize) { + fprintf(stderr, "%s: failed to read %s\n", __func__, fname); + fclose(fin); + return result; + } + } + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + .dynamic = false, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); 
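+ // each serialized leaf record mirrors ggml_graph_export: type, op and n_dims + // as uint32, then GGML_MAX_DIMS (ne, nb) pairs as uint64, the name and + // op_params bytes, and finally the raw tensor data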
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + enum ggml_op eop = (enum ggml_op) op; + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + const char * ptr_name = ptr; ptr += GGML_MAX_NAME; + const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS; + + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); + + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; + + // parse args + for (int j = 0; j < GGML_MAX_SRC; ++j) { + const int32_t arg_idx = ptr_arg_idx[j]; + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + args[j] = result.leafs[arg_idx]; + } else { + args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + + // create the tensor + // "view" operations are handled differently + // TODO: handle inplace ops - currently a copy is always made + + struct ggml_tensor * tensor = NULL; + + switch (eop) { + // TODO: implement other view ops + case GGML_OP_RESHAPE: + { + tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + } break; + case GGML_OP_VIEW: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + + size_t offs; + memcpy(&offs, ptr_op_params, sizeof(offs)); + + tensor->data = ((char *) tensor->data) + offs; + } break; + case GGML_OP_TRANSPOSE: + { + tensor = ggml_transpose(*ctx_eval, args[0]); + } break; + case GGML_OP_PERMUTE: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + } break; + default: + { + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = eop; + } break; + } + + memcpy(tensor->name, ptr_name, GGML_MAX_NAME); + memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; 
+} + +void ggml_graph_print(const struct ggml_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; + + GGML_PRINT("=== GRAPH ===\n"); + + GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); + + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + i, + node->ne[0], node->ne[1], node->ne[2], + ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, + (double) node->perf_time_us / 1000.0, + (double) node->perf_time_us / 1000.0 / node->perf_runs); + } + + GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * node = cgraph->leafs[i]; + + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", + i, + node->ne[0], node->ne[1], + ggml_op_name(node->op), + ggml_get_name(node)); + } + + for (int i = 0; i < GGML_OP_COUNT; i++) { + if (perf_total_per_op_us[i] == 0) { + continue; + } + + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + } + + GGML_PRINT("========================================\n"); +} + +// check if node is part of the graph +static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + if (cgraph == NULL) { + return true; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i] == node) { + return true; + } + } + + return false; +} + +static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * parent = cgraph->nodes[i]; + + if (parent->grad == node) { + return parent; + } + } + + return NULL; +} + +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? (void *) gparent0 : (void *) parent, + gparent0 ? "g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? 
"dashed" : "solid", + label); +} + +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { + char color[16]; + + FILE * fp = fopen(filename, "w"); + GGML_ASSERT(fp); + + fprintf(fp, "digraph G {\n"); + fprintf(fp, " newrank = true;\n"); + fprintf(fp, " rankdir = LR;\n"); + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + if (ggml_graph_get_parent(gb, node) != NULL) { + continue; + } + + if (node->is_param) { + snprintf(color, sizeof(color), "yellow"); + } else if (node->grad) { + if (ggml_graph_find(gf, node)) { + snprintf(color, sizeof(color), "green"); + } else { + snprintf(color, sizeof(color), "lightblue"); + } + } else { + snprintf(color, sizeof(color), "white"); + } + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); + } + + if (node->n_dims == 2) { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); + } else { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); + } + + if (node->grad) { + fprintf(fp, " | %s\"; ]\n", ggml_op_symbol(node->grad->op)); + } else { + fprintf(fp, "\"; ]\n"); + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + snprintf(color, sizeof(color), "pink"); + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); + } + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + } + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_nelements(node) - 1) { + fprintf(fp, ", "); + } + } + fprintf(fp, ")"); + } + fprintf(fp, "\"; ]\n"); + } + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); + } + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); + } + } + } + + fprintf(fp, "}\n"); + + fclose(fp); + + GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to set tensor from array + for (int64_t j = 0; j < ne; ++j) { + ggml_set_f32_1d(ps[p], j, x[i++]); + } + } +} + +static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + x[i++] = ggml_get_f32_1d(ps[p], j); + } + } +} + +static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] = ggml_get_f32_1d(ps[p]->grad, j); + } + } +} + +static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale; + } + } +} + +// +// ADAM +// +// ref: https://arxiv.org/pdf/1412.6980.pdf +// + +static enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + GGML_ASSERT(ggml_is_scalar(f)); + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int64_t nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_ASSERT(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + + // constants + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; + const float beta1 = params.adam.beta1; + const float beta2 = params.adam.beta2; + const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + float * g = opt->adam.g->data; // gradients + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + + float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values + + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + bool cancel = false; + + // compute the function value + float fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_OPT_CANCEL; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->adam.fx_prev = fx; + opt->adam.fx_best = opt->adam.fx_prev; + if (pf) { + pf[opt->iter % params.past] = opt->adam.fx_prev; + } + + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; + + // run the optimizer + for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; + GGML_PRINT_DEBUG ("=== iter %d ===\n", t); + + GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); + GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); + + for (int i = 0; i < np; ++i) { + GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); + } + + const int64_t t_start_wall = ggml_time_us(); + const int64_t t_start_cpu = ggml_cycles(); + UNUSED(t_start_wall); + UNUSED(t_start_cpu); + + { + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int64_t i = 0; i < nx; ++i) { + sum += (ggml_float)(g[i]*g[i]); + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g_ = g[i]*gnorm; + // update the biased first and second moment estimates + m[i] = m[i]*beta1 + g_*(1.0f - beta1); + v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); + // bias-corrected moments (beta1h also folds in the step size alpha*sched) + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + // decoupled (AdamW-style) weight decay, then the parameter step + x = x*(1.0f - p_decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } + + fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_OPT_CANCEL; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->loss_after = fx; + + // check convergence + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { + GGML_PRINT_DEBUG("converged\n"); + + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[(iter0 + t)%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + ++n_no_improvement[0]; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + fx_prev[0] = fx; + + { + const int64_t t_end_cpu = ggml_cycles(); + GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + UNUSED(t_end_cpu); + + const int64_t t_end_wall = ggml_time_us(); + GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + UNUSED(t_end_wall); + } + } + + return GGML_OPT_DID_NOT_CONVERGE; +} + +// +// L-BFGS +// +// the L-BFGS implementation below is based on the following implementation: +// +// https://github.com/chokkan/liblbfgs +// + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +static enum ggml_opt_result linesearch_backtracking( + const struct ggml_opt_params * params, + int nx, + float * x, + float * fx, + float * g, + float * d, + float * step, + const float * xp, + struct ggml_tensor * f, + struct ggml_cgraph * gb, + struct ggml_cplan * cplan, + const int np, + struct ggml_tensor * ps[], + bool * cancel, + ggml_opt_callback callback, + void * callback_data) { + int count = 0; + + float width = 0.0f; + float dg = 0.0f; + float finit = 0.0f; + float dginit = 0.0f; + float dgtest = 0.0f; + + const float dec = 0.5f; + const float inc = 2.1f; + + const int n_accum = MAX(1, params->n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + if (*step <= 0.f) { + return GGML_LINESEARCH_INVALID_PARAMETERS; + } + + // compute the initial gradient in the search direction + ggml_vec_dot_f32(nx, &dginit, g, d); + + // make sure that d points to a descent direction + if (0 < dginit) { + return GGML_LINESEARCH_FAIL; + } + + // initialize local variables + finit = *fx; + dgtest = params->lbfgs.ftol*dginit; + + while (true) { + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_mad_f32(nx, x, d, *step); + + // evaluate the function and gradient values + { + ggml_opt_set_params(np, ps, x); + + *fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if
(callback) { + // L-BFGS does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, cancel); + if (*cancel) { + return GGML_OPT_CANCEL; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + *fx += ggml_get_f32_1d(f, 0); + } + *fx *= accum_norm; + + } + + ++count; + + if (*fx > finit + (*step)*dgtest) { + width = dec; + } else { + // Armijo condition is satisfied + if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { + return count; + } + + ggml_vec_dot_f32(nx, &dg, g, d); + + // check the Wolfe condition + if (dg < params->lbfgs.wolfe * dginit) { + width = inc; + } else { + if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { + // regular Wolfe conditions + return count; + } + + if (dg > -params->lbfgs.wolfe*dginit) { + width = dec; + } else { + // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + return count; + } + } + } + + if (*step < params->lbfgs.min_step) { + return GGML_LINESEARCH_MINIMUM_STEP; + } + if (*step > params->lbfgs.max_step) { + return GGML_LINESEARCH_MAXIMUM_STEP; + } + if (params->lbfgs.max_linesearch <= count) { + return GGML_LINESEARCH_MAXIMUM_ITERATIONS; + } + + (*step) *= width; + } + + GGML_UNREACHABLE(); +} + +static enum ggml_opt_result ggml_opt_lbfgs( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { + if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { + return GGML_OPT_INVALID_WOLFE; + } + } + + const int m = params.lbfgs.m; + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_ASSERT(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { + int iter = opt->iter; + ggml_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } + + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction + + float * pf = params.past > 0 ?
opt->lbfgs.pf->data : NULL; // past function values + + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + + float fx = 0.0f; // cost function value + float xnorm = 0.0f; // ||x|| + float gnorm = 0.0f; // ||g|| + + // initialize x from the graph nodes + ggml_opt_get_params(np, ps, x); + + // the L-BFGS memory + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; + + bool cancel = false; + + // evaluate the function value and its gradient + { + ggml_opt_set_params(np, ps, x); + + fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // L-BFGS does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + return GGML_OPT_CANCEL; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->loss_before = fx; + opt->loss_after = fx; + } + + // search direction = -gradient + ggml_vec_neg_f32(nx, d, g); + + // ||x||, ||g|| + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + + // already optimized + if (gnorm/xnorm <= params.lbfgs.eps) { + return GGML_OPT_OK; + } + + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; + + // initial step + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; + + int ls = 0; + int bound = 0; + + float ys = 0.0f; + float yy = 0.0f; + float beta = 0.0f; + + int it = 0; + + while (true) { + // store the current position and gradient vectors + ggml_vec_cpy_f32(nx, xp, x); + ggml_vec_cpy_f32(nx, gp, g); + + // TODO: instead of passing &cancel here, use the return code of the linesearch + // to determine if the optimization should be cancelled + // this is a simple change, but not doing this atm, since I don't have a nice + // way to test and don't want to break something with so many changes lined up + ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + if (cancel) { + return GGML_OPT_CANCEL; + } + + if (ls < 0) { + // linesearch failed - go back to the previous point and return + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_cpy_f32(nx, g, gp); + + return ls; + } + + opt->loss_after = fx; + + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + if (gnorm/xnorm <= params.lbfgs.eps) { + // converged + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[k[0]%params.past] = fx; + } + + // check for improvement + if
(params.max_no_improvement > 0) { + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + n_no_improvement[0]++; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { + // reached the maximum number of iterations + return GGML_OPT_DID_NOT_CONVERGE; + } + + // update vectors s and y: + // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. + // y_{k+1} = g_{k+1} - g_{k}. + // + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); + + // compute scalars ys and yy: + // ys = y^t \cdot s -> 1 / \rho. + // yy = y^t \cdot y. + // + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + + lm_ys[end[0]] = ys; + + // find new search direction + // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS + + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; + + // initialize search direction with -g + ggml_vec_neg_f32(nx, d, g); + + j[0] = end[0]; + for (int i = 0; i < bound; ++i) { + j[0] = (j[0] + m - 1) % m; + // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; + // q_{i} = q_{i+1} - \alpha_{i} y_{i} + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); + } + + ggml_vec_scale_f32(nx, d, ys/yy); + + for (int i = 0; i < bound; ++i) { + // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; + // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; + } + + step[0] = 1.0; + } + + GGML_UNREACHABLE(); +} + +struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { + struct ggml_opt_params result; + + switch (type) { + case GGML_OPT_ADAM: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_ADAM, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 100, + + .print_forward_graph = true, + .print_backward_graph = true, + + .n_gradient_accumulation = 1, + + .adam = { + .n_iter = 10000, + .sched = 1.000f, + .decay = 0.0f, + .decay_min_ndim = 2, + .alpha = 0.001f, + .beta1 = 0.9f, + .beta2 = 0.999f, + .eps = 1e-8f, + .eps_f = 1e-5f, + .eps_g = 1e-3f, + .gclip = 0.0f, + }, + }; + } break; + case GGML_OPT_LBFGS: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_LBFGS, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 0, + + .print_forward_graph = true, + .print_backward_graph = true, + + .n_gradient_accumulation = 1, + + .lbfgs = { + .m = 6, + .n_iter = 100, + .max_linesearch = 20, + + .eps = 1e-5f, + .ftol = 1e-4f, + .wolfe = 0.9f, + .min_step = 1e-20f, + .max_step = 1e+20f, + + .linesearch = GGML_LINESEARCH_DEFAULT, + }, + }; + } break; + } + + return result; +} + +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + if (opt->ctx == NULL) { + struct ggml_init_params ctx_opt_params; + if (opt->params.type == GGML_OPT_ADAM) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + 
ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } else if (opt->params.type == GGML_OPT_LBFGS) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } + ctx_opt_params.mem_buffer = NULL; + ctx_opt_params.no_alloc = false; + ctx_opt_params.dynamic = false; + + opt->ctx = ggml_init(ctx_opt_params); + } + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) + : NULL; + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); + if (opt->adam.pf) { + ggml_set_zero(opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); + if (opt->lbfgs.pf) { + ggml_set_zero(opt->lbfgs.pf); + } + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); + } break; + } +} + +enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f) { + bool free_ctx = false; + if (ctx == NULL) { + struct ggml_init_params params_ctx = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + .dynamic = false, + }; + + ctx = ggml_init(params_ctx); + if (ctx == NULL) { + return GGML_OPT_NO_CONTEXT; + } + + free_ctx = true; + } + + enum ggml_opt_result result = GGML_OPT_OK; + + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); + + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); + + if (free_ctx) { + ggml_free(ctx); + } + + return result; +} + +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { + + // build forward + backward compute graphs + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ?
1 : 0)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); + + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); +} + +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + + // build forward + backward compute graphs + enum ggml_opt_result result = GGML_OPT_OK; + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + case GGML_OPT_LBFGS: + { + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + } + + if (opt->params.print_forward_graph) { + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); + } + + if (opt->params.print_backward_graph) { + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); + } + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; + + for (int b = 0; b < n; b += k) { + block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + + quantize_row_q4_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_0*sizeof(block_q4_0)); +} + +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; + + for (int b = 0; b < n; b += k) { + block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + + quantize_row_q4_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_1*sizeof(block_q4_1)); +} + +size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_0 == 0); + const int nb = k / QK5_0; + + for (int b = 0; b < n; b += k) { + block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + + quantize_row_q5_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_0*sizeof(block_q5_0)); +} + +size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_1 == 0); + const int nb = k / QK5_1; + + for (int b = 0; b < n; b += k) { + block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + + quantize_row_q5_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> 
(j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_1*sizeof(block_q5_1)); +} + +size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int b = 0; b < n; b += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + + quantize_row_q8_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; + + hist[vi/16 + 8]++; + } + } + } + + return (n/QK8_0*sizeof(block_q8_0)); +} + +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_0: + { + GGML_ASSERT(start % QK5_0 == 0); + block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; + result = ggml_quantize_q5_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_1: + { + GGML_ASSERT(start % QK5_1 == 0); + block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; + result = ggml_quantize_q5_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(start % QK8_0 == 0); + block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; + result = ggml_quantize_q8_0(src + start, block, n, n, hist); + } break; +#ifdef GGML_USE_K_QUANTS + case GGML_TYPE_Q2_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_quantize_q2_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q3_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + result = ggml_quantize_q3_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_quantize_q4_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_quantize_q5_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q6_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_quantize_q6_K(src + start, block, n, n, hist); + } break; +#endif + case GGML_TYPE_F16: + { + int elemsize = sizeof(ggml_fp16_t); + ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n); + result = n * elemsize; + } break; + case GGML_TYPE_F32: + { + int elemsize = sizeof(float); + result = n * elemsize; + memcpy((uint8_t *)dst + start * elemsize, src + start, result); + } break; + default: + assert(false); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + 
[GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", + [GGUF_TYPE_BOOL] = "bool", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + +struct gguf_header { + uint32_t magic; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +struct gguf_context * gguf_init_empty(void) { + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = GGUF_MAGIC; + ctx->header.version = GGUF_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; +} + +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); + if 
(!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + uint32_t magic = 0; + + // check the magic before making allocations + { + gguf_fread_el(file, &magic, sizeof(magic), &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + // read the header + { + ctx->header.magic = magic; + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + + // read the kv pairs + { + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_ARRAY: + { + ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), 
&offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + } + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(file, &info->name, &offset); + ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", 
+ __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size += GGML_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context + const size_t mem_size = + params.no_alloc ? + (ctx->header.n_tensors )*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_init(pdata); + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = NULL; + + if (!params.no_alloc) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (!params.no_alloc) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. 
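+        // (note: every key, string value and string-array element below was
+        //  heap-allocated during parsing via calloc/strdup, so each one must be
+        //  freed individually before the kv array itself is freed)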
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + free(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + free(ctx->infos); + } + + GGML_ALIGNED_FREE(ctx); +} + +const char * gguf_type_name(enum gguf_type type) { + return GGUF_TYPE_NAME[type]; +} + +int gguf_get_version(const struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(const struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(const struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(const struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(const struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_find_key(const struct gguf_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_get_key(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].key.data; +} + +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].type; +} + +enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.type; +} + +const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.data; +} + +const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.n; +} + +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8); + return ctx->kv[key_id].value.uint8; +} + +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8); + return ctx->kv[key_id].value.int8; +} + +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16); + return ctx->kv[key_id].value.uint16; +} + +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16); + return ctx->kv[key_id].value.int16; +} + +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32); + return 
ctx->kv[key_id].value.uint32; +} + +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32); + return ctx->kv[key_id].value.int32; +} + +float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32); + return ctx->kv[key_id].value.float32; +} + +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64); + return ctx->kv[key_id].value.uint64; +} + +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64); + return ctx->kv[key_id].value.int64; +} + +double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64); + return ctx->kv[key_id].value.float64; +} + +bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL); + return ctx->kv[key_id].value.bool_; +} + +const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); + return ctx->kv[key_id].value.str.data; +} + +int gguf_get_n_tensors(const struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +// returns the index +static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv[n_kv].key.n = strlen(key); + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = 
GGUF_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + +void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val); + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); +} + +void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + for (int i = 0; i < n; i++) { + struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]); + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; + case 
GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free(data); + } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { + GGML_ASSERT(false && "nested arrays not supported"); + } else { + gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_add_tensor( + struct gguf_context * ctx, + const struct ggml_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name); + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + } + + ctx->infos[idx].n_dims = tensor->n_dims; + for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_buf gguf_buf_init(size_t size) { + struct gguf_buf buf = { + /*buf.data =*/ size == 0 ? 
NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_buf_free(struct gguf_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { + gguf_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { + gguf_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { + // write header + gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + gguf_bwrite_str(buf, &kv->key); + gguf_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; + case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; + case GGUF_TYPE_ARRAY: + { + gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]); + } + } 
break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + } + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + gguf_bwrite_str(buf, &info->name); + gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_PAD(size, ctx->alignment); + + gguf_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_ASSERT(offset == info->offset); + + offset += size_pad; + } +} + +void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { + FILE * file = fopen(fname, "wb"); + if (!file) { + GGML_ASSERT(false && "failed to open file for writing"); + } + + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, only_meta); + + fwrite(buf.data, 1, buf.offset, file); + + gguf_buf_free(buf); + + fclose(file); +} + +size_t gguf_get_meta_size(const struct gguf_context * ctx) { + // no allocs - only compute size + struct gguf_buf buf = gguf_buf_init(0); + + gguf_write_to_buf(ctx, &buf, true); + + return buf.offset; +} + +void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, true); + + memcpy(data, buf.data, buf.offset); + + gguf_buf_free(buf); +} + +//////////////////////////////////////////////////////////////////////////////// + +int ggml_cpu_has_avx(void) { +#if defined(__AVX__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx2(void) { +#if defined(__AVX2__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512(void) { +#if defined(__AVX512F__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fma(void) { +#if defined(__FMA__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_neon(void) { +#if defined(__ARM_NEON) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_arm_fma(void) { +#if defined(__ARM_FEATURE_FMA) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_metal(void) { +#if defined(GGML_USE_METAL) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_f16c(void) { +#if defined(__F16C__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fp16_va(void) { +#if 
defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_wasm_simd(void) { +#if defined(__wasm_simd128__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_blas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_clblast(void) { +#if defined(GGML_USE_CLBLAST) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_gpublas(void) { + return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); +} + +int ggml_cpu_has_sse3(void) { +#if defined(__SSE3__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_ssse3(void) { +#if defined(__SSSE3__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_vsx(void) { +#if defined(__POWER9_VECTOR__) + return 1; +#else + return 0; +#endif +} + +////////////////////////////////////////////////////////////////////////////////
diff --git a/stable-diffusion.cpp/ggml/tests/CMakeLists.txt b/stable-diffusion.cpp/ggml/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0130cda06f73a2689093220afaade7fa33f414a --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/CMakeLists.txt @@ -0,0 +1,357 @@ +# check systems +if (NOT UNAME_S) + execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S) +endif() +if (NOT UNAME_P) + execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P) +endif() +if (NOT UNAME_M) + execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M) +endif() +#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}") + +# Mac OS + Arm can report x86_64 +# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 +if (UNAME_S MATCHES "Darwin") + if (NOT UNAME_P MATCHES "arm") + execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M) + if (SYSCTL_M MATCHES "1") + #set(UNAME_P "arm") + #set(UNAME_M "arm64") + message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789") + endif() + endif() +endif() + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + message(STATUS "ARM detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PPC64 detected") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector") +else() + message(STATUS "x86 detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c") + if (UNAME_S MATCHES "Darwin") + execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "AVX1.0") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + if (AVX1_M MATCHES "FMA") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + elseif (UNAME_S MATCHES "Linux") + message(STATUS "Linux detected") + execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M) + if (SSE3_M MATCHES "sse3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + endif() + elseif (UNAME_S MATCHES "Haiku") + message(STATUS "Haiku detected") + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + elseif (MSVC) + if (GGML_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512") + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, nor does it define the + # macros corresponding to the extensions. + # Do it manually.
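+            # (e.g. code guarded by "#ifdef __AVX512VBMI__" would otherwise never compile under MSVC)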
+            if (GGML_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() + elseif (GGML_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") + elseif (GGML_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") + endif() + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2") + endif() +endif() + +# on APPLE - include Accelerate framework +if (APPLE AND NOT GGML_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENBLAS) + set(OPENBLAS_INCLUDE_SEARCH_PATHS + /usr/include + /usr/include/openblas + /usr/include/openblas-base + /usr/local/include + /usr/local/include/openblas + /usr/local/include/openblas-base + /opt/OpenBLAS/include + $ENV{OpenBLAS_HOME} + $ENV{OpenBLAS_HOME}/include + ) + find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_library(OPENBLAS_LIB NAMES openblas libopenblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +# undefine NDEBUG so asserts don't get disabled in tests +add_definitions(-UNDEBUG) + +# +# test-vec0 + +set(TEST_TARGET test-vec0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) + +# +# test-vec1 (x86) +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND "${CMAKE_C_FLAGS}" MATCHES "avx") + set(TEST_TARGET test-vec1) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) +endif() + +# +# test-vec2 (arm) +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") + set(TEST_TARGET test-vec2) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) +endif() + +# +# test-grad0 + +set(TEST_TARGET test-grad0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-opt + +set(TEST_TARGET test-opt) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-quantize-fns + +set(TEST_TARGET test-quantize-fns) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-quantize-perf + +set(TEST_TARGET test-quantize-perf) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-mul-mat0 + +set(TEST_TARGET test-mul-mat0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
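+# unlike the plain targets above, the BLAS-capable tests below also link the
+# optional libraries collected in GGML_EXTRA_LIBS (Accelerate / OpenBLAS, when
+# found) and compile with the matching GGML_EXTRA_FLAGS definitions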
+target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-mul-mat1 (arm) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) + set(TEST_TARGET test-mul-mat1) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +endif() + +# +# test-blas0 (arm) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) + set(TEST_TARGET test-blas0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) + add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> 128 128 128) + set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +endif() + +# +# test-mul-mat2 + +set(TEST_TARGET test-mul-mat2) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test0 + +set(TEST_TARGET test0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test1 + +set(TEST_TARGET test1) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test2 + +set(TEST_TARGET test2) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test3 + +set(TEST_TARGET test3) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-pool + +set(TEST_TARGET test-pool) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-conv-transpose + +set(TEST_TARGET test-conv-transpose) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) + +# +# test-rel-pos + +set(TEST_TARGET test-rel-pos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>) + +# +# test-svd0 (arm/x86) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES
"arm" AND NOT GGML_NO_ACCELERATE) + set(TEST_TARGET test-svd0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS) + set(TEST_TARGET test-svd0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +endif() + +# +# test-customop + +set(TEST_TARGET test-customop) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-xpos + +set(TEST_TARGET test-xpos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") diff --git a/stable-diffusion.cpp/ggml/tests/test-blas0.c b/stable-diffusion.cpp/ggml/tests/test-blas0.c new file mode 100644 index 0000000000000000000000000000000000000000..cd0cc5ffd2df45bfff578c76ecab2fd6e4f30d26 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-blas0.c @@ -0,0 +1,267 @@ +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +uint64_t get_time_us(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000 + tv.tv_usec; +} + +// +// naive implementation +// + +void mul_mat_f32_0( + const float * restrict src0, // M x K + const float * restrict src1, // N x K (transposed) + float * dst, + int m, int n, int k) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0; + for (int l = 0; l < k; l++) { + sum += src0[i*k + l] * src1[j*k + l]; + } + dst[j*m + i] = sum; + } + } +} + +int main(int argc, const char ** argv) { + if (argc < 4) { + printf("Usage: %s M N K\n", argv[0]); + return 1; + } + + const int n_threads = 1; + + int M = atoi(argv[1]); + int N = atoi(argv[2]); + int K = atoi(argv[3]); + + srand(time(NULL)); + + if (M == 0) M = rand() % 1000 + 1; + if (N == 0) N = rand() % 1000 + 1; + if (K == 0) K = rand() % 1000 + 1; + + printf("M = %d, N = %d, K = %d\n", M, N, K); + + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst0 = malloc(sizeof(float)*M*N); // naive + float * dst1 = malloc(sizeof(float)*M*N); // blas + + struct ggml_init_params params = { + .mem_size = 2048ul*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * s0_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, M); + struct ggml_tensor * s1_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, N); + + struct ggml_tensor * s0_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, M); + struct ggml_tensor * s1_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, N); + + for (int j = 0; j < M; j++) { + for (int i = 0; i < K; i++) { + //src0[j*K + i] = j; + src0[j*K + i] = 1e-3*(rand() % 1000); + } + } + + for (int j = 0; j < N; j++) { + for (int i = 0; i < K; i++) { + //src1[j*K + i] = j + 1; + src1[j*K + i] = 1e-3*(rand() % 1000); + } + } + + // copy src0 to 
s0_f32 + { + float * p_f32 = s0_f32->data; + ggml_fp16_t * p_f16 = s0_f16->data; + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + p_f32[i*K + j] = src0[i*K + j]; + p_f16[i*K + j] = ggml_fp32_to_fp16(src0[i*K + j]); + } + } + } + + // copy src1 to s1_f32 + { + float * p_f32 = s1_f32->data; + ggml_fp16_t * p_f16 = s1_f16->data; + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + p_f32[i*K + j] = src1[i*K + j]; + p_f16[i*K + j] = ggml_fp32_to_fp16(src1[i*K + j]); + } + } + } + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + mul_mat_f32_0(src0, src1, dst0, M, N, K); + + // Use BLAS sgemm from Accelerate framework + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, M, K, 1.0f, src1, K, src0, K, 0.0f, dst1, M); + + struct ggml_tensor * dst2 = NULL; + struct ggml_tensor * dst3 = NULL; + + { + dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32); + + struct ggml_cgraph gf = ggml_build_forward(dst2); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + { + dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32); + + struct ggml_cgraph gf = ggml_build_forward(dst3); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + bool ok_blas = true; + bool ok_ggml_f32 = true; + bool ok_ggml_f16 = true; + + // check BLAS + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - dst1[i])/fabs(dst0[i]) > 0.0001) { + printf("dst0[%d] = %f, dst1[%d] = %f\n", i, dst0[i], i, dst1[i]); + ok_blas = false; + } + } + + // check ggml (f32) + { + float * p = dst2->data; + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.0001) { + printf("dst0[%d] = %f, dst2[%d] = %f\n", i, dst0[i], i, p[i]); + ok_ggml_f32 = false; + } + } + } + + // check ggml (f16) + { + float * p = dst3->data; + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.01) { + printf("dst0[%d] = %f, dst3[%d] = %f\n", i, dst0[i], i, p[i]); + ok_ggml_f16 = false; + } + } + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + } + +#if 0 + // print src0 + printf("src0:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src0[i*K+j]); + } + printf("\n"); + } + + // print src1 + printf("src1:\n"); + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src1[i*K+j]); + } + printf("\n"); + } + + printf("\n"); + printf("dst0 (naive):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", dst0[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst1 (BLAS):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", dst1[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst2 (ggml f32):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", ((float *)dst2->data)[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst3 (ggml f16):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", ((float *)dst3->data)[j*M+i]); + } + printf("\n"); + } + + printf("\n"); +#endif + + free(src0); + free(src1); + free(dst0); + free(dst1); + + ggml_free(ctx0); + + printf("ok_blas = %d\n", ok_blas); + if (!ok_blas) { + printf("ERROR: BLAS failed\n"); + } + + printf("ok_ggml_f32 = %d\n", ok_ggml_f32); + if (!ok_ggml_f32) { + printf("ERROR: ggml failed\n"); + } + + printf("ok_ggml_f16 = %d\n", ok_ggml_f16); + if (!ok_ggml_f16) { + printf("ERROR: ggml 
failed\n"); + } + + return (ok_blas && ok_ggml_f32 && ok_ggml_f16) ? 0 : 1; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-conv-transpose.c b/stable-diffusion.cpp/ggml/tests/test-conv-transpose.c new file mode 100644 index 0000000000000000000000000000000000000000..fbd7c89249d9b2d6d328f7f67e6de9ddcd7e117e --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-conv-transpose.c @@ -0,0 +1,236 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +void printf_tensor(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + const float * t_d = ggml_get_data_f32(t); + for (int i = 0; i < t->ne[2]; ++i) { + for (int j = 0; j < t->ne[1]; ++j) { + for (int k = 0; k < t->ne[0]; ++k) { + printf("%.1f ", t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k]); + } + printf("\n"); + } + printf("---\n"); + } + } + else if (t->type == GGML_TYPE_F16) { + const ggml_fp16_t * t_d = ggml_get_data(t); + for (int i = 0; i < t->ne[2]; ++i) { + for (int j = 0; j < t->ne[1]; ++j) { + for (int k = 0; k < t->ne[0]; ++k) { + printf("%.1f ", ggml_fp16_to_fp32(t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k])); + } + printf("\n"); + } + printf("---\n"); + } + } + else { + printf("unknown type\n"); + } +} + +void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == ne0); + GGML_ASSERT(t->ne[1] == ne1); + GGML_ASSERT(t->ne[2] == ne2); + for (int i2 = 0; i2 < ne2; ++i2) { + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0); + float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0]; + GGML_ASSERT(expected == actual); + } + } + } +} + +void test_conv_transpose_1d(void) { + + float buf_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f32[i] = (float)i; + } + + ggml_fp16_t buf_f16[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f16[i] = ggml_fp32_to_fp16((float)i); + } + + float expected_out_1[3][4] = { + {18.0, 45.0, 59.0, 37.0}, + {24.0, 61.0, 83.0, 51.0}, + {30.0, 77.0, 107.0, 65.0}, + }; + float expected_out_2[3][6] = { + {18.0, 21.0, 24.0, 29.0, 30.0, 37.0}, + {24.0, 27.0, 34.0, 39.0, 44.0, 51.0}, + {30.0, 33.0, 44.0, 49.0, 58.0, 65.0}, + }; + float expected_out_3[3][8] = { + {18.0, 21.0, 0.0, 24.0, 29.0, 0.0, 30.0, 37.0}, + {24.0, 27.0, 0.0, 34.0, 39.0, 0.0, 44.0, 51.0}, + {30.0, 33.0, 0.0, 44.0, 49.0, 0.0, 58.0, 65.0}, + }; + + // conv transpose 1d with stride 1, 2 & 3 + { + struct ggml_context * ctx = make_ctx(); + + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // l x cin + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2, 3, 2); // k x cout x cin + memcpy(k->data, buf_f16, ggml_nbytes(k)); + + struct ggml_tensor * out_1 = ggml_conv_transpose_1d(ctx, k, t, 1 /* s0 */, 0 /* p0 */, 1 /* d0 */); + struct ggml_tensor * out_2 = ggml_conv_transpose_1d(ctx, k, t, 2 /* s0 */, 0 /* p0 */, 1 /* d0 */); + struct ggml_tensor * out_3 = ggml_conv_transpose_1d(ctx, k, t, 3 /* s0 */, 0 /* p0 */, 1 /* d0 */); + + struct ggml_cgraph gf_1 = ggml_build_forward(out_1); + struct ggml_cgraph gf_2 = ggml_build_forward(out_2); + struct ggml_cgraph gf_3 = ggml_build_forward(out_3); + + ggml_graph_compute_with_ctx(ctx, &gf_1, 1); + ggml_graph_compute_with_ctx(ctx, &gf_2, 1); + 
ggml_graph_compute_with_ctx(ctx, &gf_3, 1); + + check_tensor(out_1, (float*)expected_out_1, 4, 3, 1); + check_tensor(out_2, (float*)expected_out_2, 6, 3, 1); + check_tensor(out_3, (float*)expected_out_3, 8, 3, 1); + } +} + +void test_conv_transpose_2d(void) { + + float buf_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f32[i] = (float)i; + } + + ggml_fp16_t buf_f16[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f16[i] = ggml_fp32_to_fp16((float)i); + } + + float expected_out_1[3][3][4] = { + { + {72.0, 162.0, 188.0, 106.0}, + {192.0, 430.0, 490.0, 274.0}, + {132.0, 292.0, 326.0, 180.0}, + }, + { + {96.0, 218.0, 260.0, 146.0}, + {264.0, 590.0, 682.0, 378.0}, + {180.0, 396.0, 446.0, 244.0}, + }, + { + {120.0, 274.0, 332.0, 186.0}, + {336.0, 750.0, 874.0, 482.0}, + {228.0, 500.0, 566.0, 308.0}, + }, + }; + + float expected_out_2[3][4][6] = { + { + {72.0, 78.0, 84.0, 92.0, 96.0, 106.0}, + {84.0, 90.0, 100.0, 108.0, 116.0, 126.0}, + {108.0, 120.0, 120.0, 134.0, 132.0, 148.0}, + {132.0, 144.0, 148.0, 162.0, 164.0, 180.0}, + }, + { + {96.0, 102.0, 116.0, 124.0, 136.0, 146.0}, + {108.0, 114.0, 132.0, 140.0, 156.0, 166.0}, + {156.0, 168.0, 176.0, 190.0, 196.0, 212.0}, + {180.0, 192.0, 204.0, 218.0, 228.0, 244.0}, + }, + { + {120.0, 126.0, 148.0, 156.0, 176.0, 186.0}, + {132.0, 138.0, 164.0, 172.0, 196.0, 206.0}, + {204.0, 216.0, 232.0, 246.0, 260.0, 276.0}, + {228.0, 240.0, 260.0, 274.0, 292.0, 308.0}, + }, + }; + + float expected_out_3[3][5][8] = { + { + {72.0, 78.0, 0.0, 84.0, 92.0, 0.0, 96.0, 106.0}, + {84.0, 90.0, 0.0, 100.0, 108.0, 0.0, 116.0, 126.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {108.0, 120.0, 0.0, 120.0, 134.0, 0.0, 132.0, 148.0}, + {132.0, 144.0, 0.0, 148.0, 162.0, 0.0, 164.0, 180.0}, + }, + { + {96.0, 102.0, 0.0, 116.0, 124.0, 0.0, 136.0, 146.0}, + {108.0, 114.0, 0.0, 132.0, 140.0, 0.0, 156.0, 166.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {156.0, 168.0, 0.0, 176.0, 190.0, 0.0, 196.0, 212.0}, + {180.0, 192.0, 0.0, 204.0, 218.0, 0.0, 228.0, 244.0}, + }, + { + {120.0, 126.0, 0.0, 148.0, 156.0, 0.0, 176.0, 186.0}, + {132.0, 138.0, 0.0, 164.0, 172.0, 0.0, 196.0, 206.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {204.0, 216.0, 0.0, 232.0, 246.0, 0.0, 260.0, 276.0}, + {228.0, 240.0, 0.0, 260.0, 274.0, 0.0, 292.0, 308.0}, + }, + }; + + // conv transpose 2d with stride 1, 2 & 3 + { + struct ggml_context * ctx = make_ctx(); + + struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 2, 2, 1); // w x h x cin + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, 3, 2); // w x h cin x cout + memcpy(k->data, buf_f16, ggml_nbytes(k)); + + struct ggml_tensor * out_1 = ggml_conv_transpose_2d_p0(ctx, k, t, 1); + struct ggml_tensor * out_2 = ggml_conv_transpose_2d_p0(ctx, k, t, 2); + struct ggml_tensor * out_3 = ggml_conv_transpose_2d_p0(ctx, k, t, 3); + + struct ggml_cgraph gf_1 = ggml_build_forward(out_1); + struct ggml_cgraph gf_2 = ggml_build_forward(out_2); + struct ggml_cgraph gf_3 = ggml_build_forward(out_3); + + ggml_graph_compute_with_ctx(ctx, &gf_1, 1); + ggml_graph_compute_with_ctx(ctx, &gf_2, 1); + ggml_graph_compute_with_ctx(ctx, &gf_3, 1); + + // printf("in\n"); + // printf_tensor(t); + // printf("\n\nkernel\n"); + // printf_tensor(k); + // printf("\n\nout\n"); + // printf_tensor(out); + // printf("\n\nout_2\n"); + // printf_tensor(out_2); + // printf("\n\nout_3\n"); + // printf_tensor(out_3); + + check_tensor(out_1, (float*)expected_out_1, 4, 3, 3); + check_tensor(out_2, (float*)expected_out_2, 6, 4, 3); + 
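+        // same rule per axis in 2d: stride 3 gives width (3 - 1)*3 + 2 = 8
+        // and height (2 - 1)*3 + 2 = 5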
check_tensor(out_3, (float*)expected_out_3, 8, 5, 3); + + } +} + +int main(int argc, const char * argv[]) { + test_conv_transpose_1d(); + test_conv_transpose_2d(); + return 0; +}
diff --git a/stable-diffusion.cpp/ggml/tests/test-customop.c b/stable-diffusion.cpp/ggml/tests/test-customop.c new file mode 100644 index 0000000000000000000000000000000000000000..ec261ec83584d67838730aa2db1c2b671a3fb220 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-customop.c @@ -0,0 +1,223 @@ +#include "ggml/ggml.h" +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#if defined(_WIN32) +#include <windows.h> +typedef volatile LONG atomic_int; +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +#else +#include <stdatomic.h> +#endif + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +struct ggml_context * make_ctx(void) { + struct ggml_init_params params = { + /*.mem_size =*/ 1 * 1024 * 1024, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + return ggml_init(params); +} + +char g_userdata[] = "ggml"; +atomic_int g_custom1_count = 0; +atomic_int g_custom2_count = 0; +atomic_int g_custom3_count = 0; + +void custom1(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == NULL); + assert(ggml_are_same_shape(dst, a)); + + atomic_fetch_add(&g_custom1_count, 1); + + const float * a_data = ggml_get_data_f32(a); + float * dst_data = ggml_get_data_f32(dst); + + // this assumes that the tensors are contiguous + assert(ggml_is_contiguous(dst)); + assert(ggml_is_contiguous(a)); + + // parallelize by elements + const int ne = (int)ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + for (int i = ie0; i < ie1; ++i) { + dst_data[i] = a_data[i] * 2; + } +} + +void custom2(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == g_userdata); + assert(strcmp(userdata, "ggml") == 0); + assert(ggml_are_same_shape(dst, a)); + assert(ggml_are_same_shape(dst, b)); + + atomic_fetch_add(&g_custom2_count, 1); + + const float * a_data = ggml_get_data_f32(a); + const float * b_data = ggml_get_data_f32(b); + float * dst_data = ggml_get_data_f32(dst); + + // parallelize by rows + const int nr = (int)ggml_nrows(dst); + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + // number of columns + const int nc = (int)dst->ne[0]; + + // this assumes that the tensors are contiguous + assert(ggml_is_contiguous(dst)); + assert(ggml_is_contiguous(a)); + assert(ggml_is_contiguous(b)); + + for (int ir = ir0; ir < ir1; ++ir) { + for (int ic = 0; ic < nc; ++ic) { + const int i = ir * nc + ic; + dst_data[i] = a_data[i] + b_data[i]; + } + } +} + +void custom3(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == g_userdata); + assert(strcmp(userdata, "ggml") == 0); + assert(ggml_are_same_shape(dst, a)); + assert(ggml_are_same_shape(dst, b)); + assert(ggml_are_same_shape(dst, c)); + + atomic_fetch_add(&g_custom3_count, 1); + + const float * a_data = ggml_get_data_f32(a); + const float * b_data =
+    const float * c_data = ggml_get_data_f32(c);
+    float * dst_data = ggml_get_data_f32(dst);
+
+    // don't parallelize
+    assert(ith == 0);
+
+    // number of elements
+    const int ne = (int)ggml_nelements(dst);
+
+    // this assumes that the tensors are contiguous
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_is_contiguous(a));
+    assert(ggml_is_contiguous(b));
+    assert(ggml_is_contiguous(c));
+
+    for (int i = 0; i < ne; ++i) {
+        dst_data[i] = a_data[i] + b_data[i] + c_data[i];
+    }
+}
+
+int main(int argc, const char** argv) {
+
+    float buf1_f32[1024];
+    for (int i = 0; i < 1024; ++i) {
+        buf1_f32[i] = (float)(i + 1);
+    }
+    float buf2_f32[1024];
+    for (int i = 0; i < 1024; ++i) {
+        buf2_f32[i] = (float)(i + 1) * 2;
+    }
+    float buf3_f32[1024];
+    for (int i = 0; i < 1024; ++i) {
+        buf3_f32[i] = (float)(i + 1) * 3;
+    }
+
+    // map_custom1
+    // 2 tasks, no userdata, parallelized by elements
+    {
+        struct ggml_context * ctx = make_ctx();
+        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t->data, buf1_f32, ggml_nbytes(t));
+
+        struct ggml_tensor * m1 = ggml_map_custom1(ctx, t, custom1, 2, NULL);
+
+        struct ggml_cgraph graph = ggml_build_forward(m1);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(m1);
+
+        for (int i = 0; i < ggml_nelements(m1); ++i) {
+            assert(output[i] == buf1_f32[i] * 2);
+        }
+        assert(g_custom1_count == 2);
+
+        ggml_free(ctx);
+    }
+
+    // map_custom2
+    // max tasks (4), userdata, parallelized by rows
+    {
+        struct ggml_context * ctx = make_ctx();
+        struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t1->data, buf1_f32, ggml_nbytes(t1));
+        struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t2->data, buf2_f32, ggml_nbytes(t2));
+
+        struct ggml_tensor * m2 = ggml_map_custom2(ctx, t1, t2, custom2, GGML_N_TASKS_MAX, g_userdata);
+
+        struct ggml_cgraph graph = ggml_build_forward(m2);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(m2);
+
+        for (int i = 0; i < ggml_nelements(m2); ++i) {
+            assert(output[i] == buf1_f32[i] + buf2_f32[i]);
+        }
+
+        assert(g_custom2_count == 4);
+
+        ggml_free(ctx);
+    }
+
+    // map_custom3
+    // 1 task, userdata, not parallelized
+    {
+        struct ggml_context * ctx = make_ctx();
+        struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t1->data, buf1_f32, ggml_nbytes(t1));
+        struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t2->data, buf2_f32, ggml_nbytes(t2));
+        struct ggml_tensor * t3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t3->data, buf3_f32, ggml_nbytes(t3));
+
+        struct ggml_tensor * m3 = ggml_map_custom3(ctx, t1, t2, t3, custom3, 1, g_userdata);
+
+        struct ggml_cgraph graph = ggml_build_forward(m3);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(m3);
+
+        for (int i = 0; i < ggml_nelements(m3); ++i) {
+            assert(output[i] == buf1_f32[i] + buf2_f32[i] + buf3_f32[i]);
+        }
+
+        assert(g_custom3_count == 1);
+
+        ggml_free(ctx);
+    }
+
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-grad0.cpp b/stable-diffusion.cpp/ggml/tests/test-grad0.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a559b27ab370e7af3dac9d111bc60a48b812782
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-grad0.cpp
@@ -0,0 +1,1608 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#define MAX_NARGS 3
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define GGML_SILU_FP16
+
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+static float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+static int irand(int n) {
+    if (n == 0) return 0;
+    return rand()%n;
+}
+
+static void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+static struct ggml_tensor * get_random_tensor_f32(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    }
+
+    return result;
+}
+
+static struct ggml_tensor * get_random_tensor_f16(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    }
+
+    return result;
+}
+
+static struct ggml_tensor * get_random_tensor_i32( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + int32_t imin, + int32_t imax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i0] = irand(imax - imin) + imin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + } + break; + default: + assert(false); + } + + return result; +} + +static bool check_gradient( + const char * op_name, + struct ggml_context * ctx0, + struct ggml_tensor * x[], + struct ggml_tensor * f, + int ndims, + int nargs, + float eps, + float max_error_abs, + float max_error_rel) { + + static int n_threads = -1; + if (n_threads < 0) { + n_threads = GGML_DEFAULT_N_THREADS; + + const char *env = getenv("GGML_N_THREADS"); + if (env) { + n_threads = atoi(env); + } + + printf("GGML_N_THREADS = %d\n", n_threads); + } + + struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + *gb = *gf; + ggml_build_backward_expand(ctx0, gf, gb, false); + + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + + // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot"); + // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); + + for (int i = 0; i < nargs; ++i) { + const int nelements = ggml_nelements(x[i]); + for (int k = 0; k < nelements; ++k) { + // compute gradient using finite differences + const float x0 = ggml_get_f32_1d(x[i], k); + const float xm = x0 - eps; + const float xp = x0 + eps; + ggml_set_f32_1d(x[i], k, xp); + + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + + const double f0 = ggml_get_f32_1d(f, 0); + + ggml_set_f32_1d(x[i], k, xm); + + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + + const double f1 = ggml_get_f32_1d(f, 0); + const double g0 = (f0 - f1)/(2.0*(double) eps); + + ggml_set_f32_1d(x[i], k, x0); + + // compute gradient using backward graph + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + + const double g1 = ggml_get_f32_1d(x[i]->grad, k); + + const double error_abs = fabs(g0 - g1); + const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0; + + if (error_abs > max_error_abs || error_rel > max_error_rel) { + printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", + op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel); + //assert(false); + return false; + } + } + } + + return true; +} + +// TODO: clean-up this .. 
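+// check_gradient picks the thread count up from the GGML_N_THREADS environment
+// variable, and main takes an optional iteration count, so a quick multi-threaded
+// run looks like, e.g. (binary name assumed):
+//
+//     GGML_N_THREADS=4 ./test-grad0 2
+//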
+static bool check_mat_mul( + const struct ggml_tensor * y, + const struct ggml_tensor * x0, + const struct ggml_tensor * x1) { + float * dst = (float *) y->data; + float * src0 = (float *) x0->data; + float * src1 = (float *) x1->data; + + const int nc = x0->ne[1]; + const int nr = x1->ne[1]; + const int nk = x0->ne[0]; + + GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk); + + GGML_PRINT_DEBUG("x0:\n"); + for (int j = 0; j < x0->ne[1]; ++j) { + for (int i = 0; i < x0->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("x1:\n"); + for (int j = 0; j < x1->ne[1]; ++j) { + for (int i = 0; i < x1->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]); + for (int j = 0; j < y->ne[1]; ++j) { + for (int i = 0; i < y->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]); + } + GGML_PRINT_DEBUG("\n"); + } + + for (int i = 0; i < nr; ++i) { + for (int j = 0; j < nc; ++j) { + float sum = 0.0f; + + for (int k = 0; k < nk; ++k) { + sum += src0[j*nk + k]*src1[i*nk + k]; + } + + if (fabsf(dst[i*nc + j] - sum) > 1e-5f) { + fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum); + assert(false); + return false; + } + } + } + + return true; +} + +#define NUM_PERMUTATIONS (4*3*2*1) + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + /* .mem_size = */ 256*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + int64_t ne[4]; + + int all_permutations[4 * NUM_PERMUTATIONS]; + { + int count = 0; + for (int ax0=0; ax0<4; ++ax0) { + for (int ax1=0; ax1<4; ++ax1) { + if (ax1 == ax0) continue; + for (int ax2=0; ax2<4; ++ax2) { + if (ax2 == ax0) continue; + if (ax2 == ax1) continue; + for (int ax3=0; ax3<4; ++ax3) { + if (ax3 == ax0) continue; + if (ax3 == ax1) continue; + if (ax3 == ax2) continue; + assert(count < NUM_PERMUTATIONS); + all_permutations[count*4+0] = ax0; + all_permutations[count*4+1] = ax1; + all_permutations[count*4+2] = ax2; + all_permutations[count*4+3] = ax3; + ++count; + } + } + } + } + } + + unsigned seed_iter = 1; + + // original loop: 1000 + int niter = 4; + const char *env = getenv("GGML_NLOOP"); + if (env != NULL) { + niter = atoi(env); + } + if (argc > 1) { + niter = atoi(argv[1]); + } + for (int iter = 0; iter < niter; ++iter) { + srand(seed_iter); + seed_iter = rand(); + unsigned seed = rand(); + + printf("test-grad0: iter:%d/%d\n", iter, niter); + struct ggml_context * ctx0 = ggml_init(params); + + get_random_dims(ne, 4); + + struct ggml_tensor * x[MAX_NARGS]; + + // add f32 + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); + + check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); + } + } + + // add f16 + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); + + check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f); + } + } + + // 
sub + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1])); + + check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // mul + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1])); + + check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // div + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1])); + + check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f); + } + } + + // sqr + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0])); + + check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // sqrt + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); + + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); + } + } + + // log + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0])); + + check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + } + } + + // sum + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, x[0]); + + check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + + // sum_rows + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0]))); + + check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // mean, not yet fully implemented + if(0) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0])); + + check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // argmax + if (0) + { + 
srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0])); + + check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // repeat + { + srand(seed); + int64_t ne2[4]; + get_random_dims(ne2, 4); + + ne2[0] = ne[0] * ne2[0]; + ne2[1] = ne[1] * ne2[1]; + ne2[2] = 1; + ne2[3] = 1; + + const int nargs = 1; + for (int ndims = 1; ndims <= 2; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); + + check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // repeat back + { + srand(seed); + int64_t ne2[4]; + get_random_dims(ne2, 4); + + ne2[0] = ne[0] * ne2[0]; + ne2[1] = ne[1] * ne2[1]; + ne2[2] = 1; + ne2[3] = 1; + + const int nargs = 1; + for (int ndims = 1; ndims <= 2; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0])))); + + check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // abs (finite differences do not work) + //{ + // const int nargs = 1; + + // for (int ndims = 1; ndims <= 2; ++ndims) { + // for (int i = 0; i < nargs; ++i) { + // x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + // ggml_set_param(ctx0, x[i]); + // } + + // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0])); + + // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f); + // } + //} + + // sgn + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0])); + + check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // neg + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0])); + + check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // step + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0])); + + check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // tanh, not yet fully implemented + if(0) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0])); + + check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + 
} + + // mul_mat + { + srand(seed); + const int nargs = 2; + + for (int ndims = 2; ndims <= 4; ++ndims) { + int max_nrep = (ndims >= 3) ? 2 : 1; + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) { + for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) { + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] = ne[0]; + ne2[2] = nrep2 * ne[2]; + ne2[3] = nrep3 * ne[3]; + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + } + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); + + GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); + + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + if (ndims == 2) { + // check_mat_mul does not support ndims > 2 + check_mat_mul(m, x[1], x[0]); + } + } + } + } + } + + // elu, not yet fully implemented + if(0) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0])); + + check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // relu + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0])); + + check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // gelu, not yet fully implemented + if(0) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0])); + + check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // silu + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0])); + +#ifdef GGML_SILU_FP16 + // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds. 
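+        // (silu(x) = x*sigmoid(x); with GGML_SILU_FP16 the forward pass goes through
+        // an f16 lookup table, so f0/f1 in the finite difference carry f16 quantization
+        // noise and only a loose absolute bound is meaningful.)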
+ check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY); +#else + check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); +#endif + } + } + + // rms_norm + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f)); + + check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); + } + } + + // scale + { + srand(seed); + const int nargs = 2; + + int64_t ne2[4]; + ne2[0] = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1])); + + check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // cpy f32 + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); + + check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // cpy f16 + { + srand(seed); + const int nargs = 2; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); + + check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); + } + } + + // reshape (1d->nd) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // reshape (nd->1d) + { + srand(seed); + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 1d + { + srand(seed); + int64_t ne2[4] = { 1, 1, 1, 1 }; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + 
ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 2d + { + srand(seed); + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 3d + { + srand(seed); + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 3; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 3); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 3); + } + + x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; + const int offset = offsets[0] + offsets[1] + offsets[2]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 4d + { + srand(seed); + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 4; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 4); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 4); + } + + x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * 
x[0]->nb[2]; + offsets[3] = irand(max_offsets[3]) * x[0]->nb[3]; + const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_1d + { + srand(seed); + int64_t ne2[4]; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset)); + + check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_2d + { + srand(seed); + int64_t ne2[4]; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 1; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset)); + + check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_1d + { + srand(seed); + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + const int k0 = irand(ggml_nelements(x[0])); + const int k1 = irand(ggml_nelements(x[0])); + const int i0 = MIN(k0, k1); + const int i1 = MAX(k0, k1); + + const int offset = i0 * sizeof(float); + const int nelem = i1 - i0; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset)); + + check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_2d + { + srand(seed); + int64_t ne2[4]; + int64_t nb2[4]; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + get_random_dims(ne2, 2); + while (ne2[0]*ne2[1] > ggml_nelements(x[0])) { + get_random_dims(ne2, 2); + } + const int count = ne2[0]*ne2[1]; + + nb2[0] = sizeof(float); + nb2[1] = nb2[0]*ne2[0]; + + ggml_set_param(ctx0, x[0]); + + const int max_offset = ggml_nelements(x[0]) - count; + const int offset = irand(max_offset+1) * sizeof(float); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset)); + + check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_3d + { + srand(seed); + int64_t ne2[4] = {1,1,1,1}; + int64_t nb2[4] = {0,0,0,0}; + + const int nargs = 1; + for (int ndims = 
1; ndims <= 4; ++ndims) {
+
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+            get_random_dims(ne2, 3);
+            while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
+                get_random_dims(ne2, 3);
+            }
+            const int count = ne2[0]*ne2[1]*ne2[2];
+
+            nb2[0] = sizeof(float);
+            nb2[1] = nb2[0]*ne2[0];
+            nb2[2] = nb2[1]*ne2[1];
+
+            ggml_set_param(ctx0, x[0]);
+
+            const int max_offset = ggml_nelements(x[0]) - count;
+            const int offset = irand(max_offset+1) * sizeof(float);
+
+            struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
+
+            check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+        }
+    }
+
+    // permute
+    {
+        srand(seed);
+        int64_t ne2[4];
+
+        const int nargs = 1;
+        for (int ndims = 1; ndims <= 4; ++ndims)
+        {
+            // ggml_permute will set axes of dimensions below n_dims to 1.
+            // to make ggml_permute work correctly on all axes,
+            // the input tensor needs maximal n_dim of 4.
+            for (int i = 0; i < ndims; ++i) {
+                ne2[i] = ne[i];
+            }
+            for (int i = ndims; i < 4; ++i) {
+                ne2[i] = 1;
+            }
+            x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
+
+            ggml_set_param(ctx0, x[0]);
+
+            const int p = irand(NUM_PERMUTATIONS);
+            const int ax0 = all_permutations[p*4+0];
+            const int ax1 = all_permutations[p*4+1];
+            const int ax2 = all_permutations[p*4+2];
+            const int ax3 = all_permutations[p*4+3];
+
+            // sum requires contiguous tensor rows
+            struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
+
+            check_gradient("permute", ctx0, x, f, 4, nargs, 1e-3f, 1e-3f, INFINITY);
+        }
+    }
+
+    // softmax
+    {
+        srand(seed);
+        const int nargs = 1;
+
+        int64_t ne2[4];
+        get_random_dims(ne2, 4);
+
+        for (int ndims = 1; ndims <= 3; ++ndims) {
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+            ggml_set_param(ctx0, x[0]);
+
+            float eps = 1e-6f;
+            // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
+            // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
+            struct ggml_tensor * f = ggml_sum(ctx0,
+                                        ggml_log(ctx0,
+                                            ggml_add1(ctx0,
+                                                ggml_scale(ctx0,
+                                                    ggml_soft_max(ctx0, x[0]),
+                                                    ggml_new_f32(ctx0, 1.0f - eps)),
+                                                ggml_new_f32(ctx0, eps))));
+
+            check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+            // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
+            // this may result in gradients that differ from finite differences.
+            // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
+            // if only the table lookup causes gradients to differ this is acceptable.
+        }
+    }
+
+    // cross_entropy_loss
+    {
+        srand(seed);
+        const int nargs = 1;
+
+        int64_t ne2[4];
+        get_random_dims(ne2, 4);
+
+        for (int ndims = 1; ndims <= 4; ++ndims) {
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
+            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
+            // the second argument to cross_entropy_loss must sum up to 1 for each row
+            int nr = ggml_nrows(x[1]);
+            int nc = ggml_nelements(x[1]) / nr;
+            for (int ir = 0; ir < nr; ++ir) {
+                float sum = 0;
+                for (int ic = 0; ic < nc; ++ic) {
+                    sum += ((float *) x[1]->data)[ic + ir*nc];
+                }
+                for (int ic = 0; ic < nc; ++ic) {
+                    ((float *) x[1]->data)[ic + ir*nc] /= sum;
+                }
+            }
+            ggml_set_param(ctx0, x[0]);
+
+            struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
+
+            check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
+        }
+    }
+
+    // rope f32
+    {
+        srand(seed);
+        const int nargs = 1;
+
+        int64_t ne2[4];
+        get_random_dims(ne2, 4);
+        ne2[0] += ne2[0] % 2;
+        int n_rot = ne2[0];
+
+        for (int ndims = 3; ndims <= 4; ++ndims) {
+            for (int mode = 0; mode < 4; ++mode) {
+                for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+                    x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+                    struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                    for (int i = 0; i < ne2[2]; ++i) {
+                        ((int32_t *) p->data)[i] = n_past + i;
+                    }
+
+                    ggml_set_param(ctx0, x[0]);
+
+                    const bool skip_past = (mode & 1);
+                    if (skip_past) {
+                        // we have no past, so this would have to work on uninitialized memory.
+                        // we only test the gradients here;
+                        // skip_past should have no influence on gradient computation.
+                        // so when other modes work, we assume that this does as well.
+ continue; + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); + + GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); + check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); + } + } + } + } + + // rope f16 + { + srand(seed); + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] += ne2[0] % 2; + int n_rot = ne2[0]; + + for (int ndims = 3; ndims <= 4; ++ndims) { + for (int mode = 0; mode < 4; ++mode) { + for (int n_past = 1; n_past < ne2[2]; ++n_past) { + x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f); + + struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]); + for (int i = 0; i < ne2[2]; ++i) { + ((int32_t *) p->data)[i] = n_past + i; + } + + ggml_set_param(ctx0, x[0]); + + const bool skip_past = (mode & 1); + if (skip_past) { + // we have no past, so this would have to work on uninitialized memory. + // we only test the gradients here; + // skip_past should have no influence on gradient computation. + // so when other modes work, we assume that this does as well. + continue; + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); + + GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); + check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); + } + } + } + } + + // flash_attn f32 + { + srand(seed); + const int nargs = 3; + + int64_t ne2[4]; + + get_random_dims(ne2, 4); + int64_t D = ne2[0]; + int64_t N = ne2[1]; + int64_t M = ne2[2] + N; + int64_t B = ne2[3]; + + for (int masked = 0; masked <= 1; ++masked) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int max_nrep = (ndims >= 3) ? 2 : 1; + for (int nrep = 1; nrep < max_nrep; ++nrep) { + int64_t neq[4] = { D, N, B*nrep, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + + check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + } + } + } + } + + // flash_attn f16, not yet fully implemented + if(0) + { + srand(seed); + const int nargs = 3; + + int64_t ne2[4]; + + get_random_dims(ne2, 4); + int64_t D = ne2[0]; + int64_t N = ne2[1]; + int64_t M = ne2[2] + N; + int64_t B = ne2[3]; + + for (int masked = 0; masked <= 1; ++masked) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int64_t neq[4] = { D, N, B, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); + + struct ggml_tensor * f = 
ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
+            }
+        }
+    }
+        ggml_free(ctx0);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-mul-mat0.c b/stable-diffusion.cpp/ggml/tests/test-mul-mat0.c
new file mode 100644
index 0000000000000000000000000000000000000000..6212da41a6bb144c10a5f0ce66a9556c52de889b
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-mul-mat0.c
@@ -0,0 +1,332 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
+#include "ggml/ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <inttypes.h>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define MAX_NARGS 2
+
+float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+    return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+struct ggml_tensor * get_random_tensor(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    }
+
+    return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+    return ((float *)t->data)[idx];
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+    ((float *)t->data)[idx] = value;
+}
+
+bool check_gradient(
+        const char * op_name,
+        struct ggml_context * ctx0,
+        struct ggml_tensor * x[],
+        struct ggml_tensor * f,
+        int ndims,
+        int nargs,
+        float eps,
+        float max_error_abs,
+        float max_error_rel) {
+    const int n_threads = 1;
+
+    struct ggml_cgraph gf = ggml_build_forward (f);
+    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_reset  (&gf);
+    ggml_set_f32      (f->grad, 1.0f);
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+    ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
+    ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
+
+    for (int i = 0; i < nargs; ++i) {
+        const int64_t nelements = ggml_nelements(x[i]);
+        for (int64_t k = 0; k < nelements; ++k) {
+            // compute gradient using finite differences
+            const float x0 = get_element(x[i], k);
+
+            set_element(x[i], k, x0 + eps);
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+            const float f0 = ggml_get_f32_1d(f, 0);
+
+            set_element(x[i], k, x0 - eps);
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+            const float f1 = ggml_get_f32_1d(f, 0);
+ + const float g0 = (f0 - f1)/(2.0f*eps); + + set_element(x[i], k, x0); + + // compute gradient using backward graph + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + const float g1 = get_element(x[i]->grad, k); + + const float error_abs = fabsf(g0 - g1); + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + + if (error_abs > max_error_abs || error_rel > max_error_rel) { + printf("%s: ndims=%d, i=%d, k=%" PRId64 ", g0=%f, g1=%f, error_abs=%f, error_rel=%f\n", op_name, ndims, i, k, g0, g1, error_abs, error_rel); + assert(false); + } + } + } + + return true; +} + + +float mat_get(const struct ggml_tensor * t, int i0, int i1, int i2, int i3) { + const size_t nb0 = t->nb[0]; + const size_t nb1 = t->nb[1]; + const size_t nb2 = t->nb[2]; + const size_t nb3 = t->nb[3]; + + return + *((float*) ((char*)t->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)); +} + +bool check_mat_mul( + const struct ggml_tensor * y, + const struct ggml_tensor * x0, + const struct ggml_tensor * x1) { + const int64_t n00 = x0->ne[0]; + const int64_t n10 = x0->ne[1]; + const int64_t n20 = x0->ne[2]; + const int64_t n30 = x0->ne[3]; + + const int64_t n01 = x1->ne[0]; + const int64_t n11 = x1->ne[1]; + const int64_t n21 = x1->ne[2]; + const int64_t n31 = x1->ne[3]; + + const int64_t n02 = y->ne[0]; + const int64_t n12 = y->ne[1]; + const int64_t n22 = y->ne[2]; + const int64_t n32 = y->ne[3]; + + printf("x0: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n00, n10, n20, n30); + for (int j = 0; j < n10; ++j) { + for (int i = 0; i < n00; ++i) { + printf("%6.3f ", mat_get(x0, i, j, 0, 0)); + } + printf("\n"); + } + printf("\n"); + + printf("x1: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n01, n11, n21, n31); + for (int j = 0; j < n11; ++j) { + for (int i = 0; i < n01; ++i) { + printf("%6.3f ", mat_get(x1, i, j, 0, 0)); + } + printf("\n"); + } + printf("\n"); + + printf("y: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n02, n12, n22, n32); + for (int j = 0; j < n12; ++j) { + for (int i = 0; i < n02; ++i) { + printf("%6.3f ", mat_get(y, i, j, 0, 0)); + } + printf("\n"); + } + + for (int i3 = 0; i3 < n32; ++i3) { + for (int i2 = 0; i2 < n22; ++i2) { + for (int i1 = 0; i1 < n12; ++i1) { + for (int i0 = 0; i0 < n02; ++i0) { + float sum = 0.0f; + for (int k = 0; k < n00; ++k) { + sum += mat_get(x0, k, i0, i2, i3) * mat_get(x1, k, i1, i2, i3); + } + if (fabsf(sum - mat_get(y, i0, i1, i2, i3)) > 1e-5) { + printf("error: i0=%d, i1=%d, i2=%d, i3=%d, sum=%f, y=%f\n", + i0, i1, i2, i3, sum, mat_get(y, i0, i1, i2, i3)); + assert(false); + return false; + } + } + } + } + } + + return true; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + int64_t ne[4]; + + // original loop: 500 + int niter = 500; + const char *env = getenv("GGML_NLOOP"); + if (env != NULL) { + niter = atoi(env); + } + if (argc > 1) { + niter = atoi(argv[1]); + } + + int n_threads = 1; + + for (int iter = 0; iter < niter; ++iter) { + printf("test-mul-mat0: iter:%d/%d\n", iter, niter); + struct ggml_context * ctx0 = ggml_init(params); + + get_random_dims(ne, 4); + + struct ggml_tensor * x[MAX_NARGS]; + + // mul_mat + { + const int nargs = 1; + + for (int ndims = 2; ndims <= 4; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ne[1] = rand()%4 + 1; + x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + 
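+            // ggml_mul_mat(a, b) contracts over dimension 0 of both arguments
+            // (a: [k, m, ...], b: [k, n, ...] -> dst: [m, n, ...]); the asserts
+            // below spell out that convention for this call.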
+            struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+            struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+            printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n",
+                m->ne[0], m->ne[1], m->ne[2], m->ne[3],
+                x[1]->ne[0], x[1]->ne[1], x[1]->ne[2], x[1]->ne[3],
+                x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]);
+
+            assert(m->ne[0] == x[1]->ne[1]);
+            assert(m->ne[1] == x[0]->ne[1]);
+            assert(m->ne[2] == x[0]->ne[2]);
+            assert(m->ne[3] == x[0]->ne[3]);
+
+            if (ndims <= 2) {
+                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            } else {
+                struct ggml_cgraph gf = ggml_build_forward(m);
+                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            }
+
+            check_mat_mul(m, x[1], x[0]);
+        }
+    }
+
+    // mul_mat (transposed)
+    {
+        const int nargs = 1;
+
+        for (int ndims = 2; ndims <= 4; ++ndims) {
+            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            ne[1] = ne[0];
+            ne[0] = rand()%4 + 1;
+            x[1] = ggml_cont(ctx0, ggml_transpose(ctx0, get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f)));
+
+            ggml_set_param(ctx0, x[0]);
+
+            struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+            struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+            printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n",
+                m->ne[0], m->ne[1], m->ne[2], m->ne[3],
+                x[1]->ne[0], x[1]->ne[1], x[1]->ne[2], x[1]->ne[3],
+                x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]);
+
+            assert(m->ne[0] == x[1]->ne[1]);
+            assert(m->ne[1] == x[0]->ne[1]);
+            assert(m->ne[2] == x[0]->ne[2]);
+            assert(m->ne[3] == x[0]->ne[3]);
+
+            if (ndims <= 2) {
+                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            } else {
+                struct ggml_cgraph gf = ggml_build_forward(m);
+                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            }
+
+            check_mat_mul(m, x[1], x[0]);
+        }
+    }
+    ggml_free(ctx0);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-mul-mat1.c b/stable-diffusion.cpp/ggml/tests/test-mul-mat1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b725a5872c7f4c7910961a60c8fa58e1506bda65
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-mul-mat1.c
@@ -0,0 +1,312 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <sys/time.h>
+
+#include <arm_neon.h>
+
+#include <Accelerate/Accelerate.h>
+
+const int M = 1280;
+const int N = 1536;
+const int K = 1280;
+
+uint64_t get_time_us(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+//
+// naive implementation
+//
+
+void mul_mat_f32_0(
+    const float * restrict src0, // M x K
+    const float * restrict src1, // N x K (transposed)
+    float * dst,
+    int m, int n, int k) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sum = 0;
+            for (int l = 0; l < k; l++) {
+                sum += src0[i*k + l] * src1[j*k + l];
+            }
+            dst[i*n + j] = sum;
+        }
+    }
+}
+
+void mul_mat_f16_0(
+    const __fp16 * src0,
+    const __fp16 * src1,
+          float * dst,
+    int m, int n, int k) {
+    const int k32 = k & ~31;
+
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sumf = 0.0;
+
+            float16x8_t sum0 = vdupq_n_f16(0.0f);
+            float16x8_t sum1 = vdupq_n_f16(0.0f);
+            float16x8_t sum2 = vdupq_n_f16(0.0f);
+            float16x8_t sum3 = vdupq_n_f16(0.0f);
+
+            float16x8_t x0, x1, x2, x3;
+            float16x8_t y0, y1, y2, y3;
+
+            const __fp16 * restrict p0 = src0 + i*k;
+            const __fp16 * restrict p1 = src1 + j*k;
+
+            for (int l = 0; l < k32; l += 32) {
+                x0 = vld1q_f16(p0 + l + 0 );
+                x1 = vld1q_f16(p0 + l + 8 );
+                x2 = vld1q_f16(p0 + l + 16);
+                x3 = vld1q_f16(p0 + l + 24);
+
+                y0 = vld1q_f16(p1 + l + 0 );
+                y1 = vld1q_f16(p1 + l + 8 );
+                y2 = vld1q_f16(p1 + l + 16);
+                y3 = vld1q_f16(p1 + l + 24);
+
+                sum0 = vfmaq_f16(sum0, x0, y0);
+                sum1 = vfmaq_f16(sum1, x1, y1);
+                sum2 = vfmaq_f16(sum2, x2, y2);
+                sum3 = vfmaq_f16(sum3, x3, y3);
+            }
+
+            // reduce sum0..sum3 to sum0
+            sum0 = vaddq_f16(sum0, sum1);
+            sum2 = vaddq_f16(sum2, sum3);
+            sum0 = vaddq_f16(sum0, sum2);
+
+            // load sum0 into 2 float32x4_t
+            float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
+            float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
+
+            // reduce sum0f32 and sum1f32 to sumf
+            sum0f32 = vaddq_f32(sum0f32, sum1f32);
+
+            float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
+            sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+
+            //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
+
+            // handle the leftover elements when k is not a multiple of 32
+            for (int l = k32; l < k; l++) {
+                sumf += p0[l]*p1[l];
+            }
+
+            dst[i*n + j] = sumf;
+        }
+    }
+}
+
+// blocking with block size 32
+void mul_mat_f16_1(
+    const __fp16 * src0,
+    const __fp16 * src1,
+          float * dst,
+    int m, int n, int k) {
+
+    const int k32 = k & ~31;
+    const int bs  = 32;
+
+    memset(dst, 0, m*n*sizeof(float));
+
+    for (int i = 0; i < m; i += bs) {
+        for (int j = 0; j < n; j += bs) {
+            for (int l = 0; l < k; l += bs) {
+                for (int ii = i; ii < i + bs; ii++) {
+                    const __fp16 * restrict p0 = src0 + ii*k;
+
+                    float16x8_t x0, x1, x2, x3;
+
+                    x0 = vld1q_f16(p0 + l + 0 );
+                    x1 = vld1q_f16(p0 + l + 8 );
+                    x2 = vld1q_f16(p0 + l + 16);
+                    x3 = vld1q_f16(p0 + l + 24);
+
+                    for (int jj = j; jj < j + bs; jj++) {
+                        float sumf = 0.0;
+
+                        float16x8_t sum0 = vdupq_n_f16(0.0f);
+                        float16x8_t sum1 = vdupq_n_f16(0.0f);
+                        float16x8_t sum2 = vdupq_n_f16(0.0f);
+                        float16x8_t sum3 = vdupq_n_f16(0.0f);
+
+                        float16x8_t y0, y1, y2, y3;
+
+                        const __fp16 * restrict p1 = src1 + jj*k;
+
+                        y0 = vld1q_f16(p1 + l + 0 );
+                        y1 = vld1q_f16(p1 + l + 8 );
+                        y2 = vld1q_f16(p1 + l + 16);
+                        y3 = vld1q_f16(p1 + l + 24);
+
+                        sum0 = vfmaq_f16(sum0, x0, y0);
+                        sum1 = vfmaq_f16(sum1, x1, y1);
+                        sum2 = vfmaq_f16(sum2, x2, y2);
+                        sum3 = vfmaq_f16(sum3, x3, y3);
+
+                        // reduce sum0..sum3 to sum0
+                        sum0 = vaddq_f16(sum0, sum1);
+                        sum2 = vaddq_f16(sum2, sum3);
+                        sum0 = vaddq_f16(sum0, sum2);
+
+                        // load sum0 into 2 float32x4_t
+                        float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
+                        float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
+
+                        // reduce sum0f32 and sum1f32 to sumf
+                        sum0f32 = vaddq_f32(sum0f32, sum1f32);
+
+                        float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
+                        sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+
+                        //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
+
+                        dst[ii*n + jj] += sumf;
+                    }
+                }
+            }
+        }
+    }
+
+}
+
+void mul_mat_f8_0(
+    const uint8_t * src0,
+    const uint8_t * src1,
+          float * dst,
+    int m, int n, int k) {
+    const int k32 = k & ~31;
+
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sumf = 0.0;
+
+            const uint8_t * restrict p0 = src0 + i*k;
+            const uint8_t * restrict p1 = src1 + j*k;
+
+            for (int l = 0; l < k32; l += 32) {
+                uint8x16_t x0 = vld1q_u8(p0 + l + 0 );
+                uint8x16_t x1 = vld1q_u8(p0 + l + 16);
+
+                uint8x16_t y0 = vld1q_u8(p1 + l + 0 );
+                uint8x16_t y1 = vld1q_u8(p1 + l + 16);
+
+                x0 = vmulq_u8(x0, y0);
+                x1 = vmulq_u8(x1, y1);
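+
+                // caution: vmulq_u8 keeps only the low 8 bits of each product and
+                // vaddvq_u8 reduces to a uint8_t, so both wrap modulo 256; this f8
+                // path is a throughput experiment rather than an exact GEMM.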
+ + sumf += vaddvq_u8(x0) + vaddvq_u8(x1); + } + + dst[i*n + j] = sumf; + } + } +} + +int main(int argc, const char ** argv) { + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst = malloc(sizeof(float)*M*N); + + for (int i = 0; i < M*K; i++) { + src0[i] = rand() / (float)RAND_MAX; + } + + for (int i = 0; i < N*K; i++) { + src1[i] = rand() / (float)RAND_MAX; + } + + // convert src0 and src1 to __fp16 + __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M*K)); + __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*K)); + + uint8_t * src0_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*M*K)); + uint8_t * src1_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*N*K)); + + { + const uint64_t t_start = get_time_us(); + + for (int i = 0; i < M*K; i++) { + src0_fp16[i] = src0[i]; + //printf("%f %f\n", src0[i], src0_fp16[i]); + //assert(!isnan(src0_fp16[i])); + } + + for (int i = 0; i < N*K; i++) { + src1_fp16[i] = src1[i]; + } + + const uint64_t t_end = get_time_us(); + printf("convert time: %f ms\n", (t_end - t_start) / 1000.0); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], src0_fp16[i]); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + const int nIter = 1; + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + if (method == 0) { + mul_mat_f32_0(src0, src1, dst, M, N, K); + } + + if (method == 1) { + mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K); + } + + if (method == 2) { + mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K); + } + + if (method == 3) { + mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K); + } + + if (method == 4) { + // Use BLAS sgemm from Accelerate framework + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0f, src0, K, src1, K, 0.0f, dst, N); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + printf("%s: elapsed us: %llu / %f ms\n", __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter); + } + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + free(src0_fp16); + free(src1_fp16); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-mul-mat2.c b/stable-diffusion.cpp/ggml/tests/test-mul-mat2.c new file mode 100644 index 0000000000000000000000000000000000000000..89af28636dd42cf7a0fdd029477e88c393dbe72d --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-mul-mat2.c @@ -0,0 +1,2585 @@ +// quantized matrix multiplication + +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ARM_NEON) +#include "arm_neon.h" +#elif defined(__AVX__) || defined(__AVX2__) +#include "immintrin.h" +#endif + +#ifndef MIN +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#include <intrin.h>
+#define __builtin_popcountll __popcnt64
+#endif
+
+const int M = 1280;
+const int N = 1536;
+const int K = 1280;
+
+//const int M = 64;
+//const int N = 64;
+//const int K = 64;
+
+#define QK 64
+#define QB 4
+
+//#define GGML_GQ_USE_FP16_SCALE
+
+#if defined(GGML_GQ_USE_FP16_SCALE)
+#define gq_scale_t ggml_fp16_t
+#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
+#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
+#else
+#define gq_scale_t float
+#define GGML_FP32_TO_GQ(x) (x)
+#define GGML_GQ_TO_FP32(x) (x)
+#endif
+
+#define gq_t_bits 64
+#define gq_quant_t uint64_t
+
+float frand(void) {
+    return (float) rand() / (float) RAND_MAX;
+}
+
+#if defined(__AVX2__)
+// horizontally reduce 8 32-bit integers
+static inline uint32_t _mm256_hadd_epi32_gg(__m256i v) {
+    __m128i v0 = _mm256_extractf128_si256(v, 0);
+    __m128i v1 = _mm256_extractf128_si256(v, 1);
+
+    v0 = _mm_add_epi32(v0, v1);
+
+    v1 = _mm_shuffle_epi32(v0, 0x0e);
+    v0 = _mm_add_epi32(v0, v1);
+
+    v1 = _mm_shuffle_epi32(v0, 0x01);
+    v0 = _mm_add_epi32(v0, v1);
+
+    return _mm_cvtsi128_si32(v0);
+}
+
+//static inline float _mm256_hadd_epi32_gg(__m256i v) {
+//    const __m256 v0 = _mm256_cvtepi32_ps(v);
+//    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 1));
+//    const __m128 t1 = _mm_hadd_ps(t0, t0);
+//
+//    return _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
+//}
+
+// horizontally reduce 32 8-bit integers
+static inline int32_t _mm256_hadd_epi8_gg(__m256i v0) {
+    __m256i v1 = _mm256_maddubs_epi16(v0, _mm256_set1_epi8(1));
+    __m256i v2 = _mm256_madd_epi16 (v1, _mm256_set1_epi16(1));
+
+    return _mm256_hadd_epi32_gg(v2);
+}
+
+static inline float _mm256_hadd_ps_gg(__m256 v) {
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+    const __m128 t1 = _mm_hadd_ps(t0, t0);
+
+    return _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
+}
+#endif
+
+//
+// naive implementation
+//
+
+void mul_mat_f32_naive(
+    const float * restrict src0, // M x K
+    const float * restrict src1, // N x K (transposed)
+    float * dst,
+    int m, int n, int k) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sum = 0;
+            for (int l = 0; l < k; l++) {
+                sum += src0[i*k + l] * src1[j*k + l];
+            }
+            dst[i*n + j] = sum;
+        }
+    }
+}
+
+//
+// method 1
+//
+
+static inline int quantize_1_blocks_per_row(int k) {
+    return k/QK;
+}
+
+static inline int quantize_1_quants_per_block(void) {
+    return QK/gq_t_bits;
+}
+
+static inline int quantize_1_row_size(int k) {
+    const int nb = quantize_1_blocks_per_row(k);
+    const int nq = quantize_1_quants_per_block();
+
+    return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
+}
+
+void quantize_1(const float * src, void * dst, int n, int k) {
+    char * p0 = dst;
+
+    gq_quant_t pp[QB];
+
+    for (int j = 0; j < n; j++) {
+        for (int i = 0; i < k/QK; i++) {
+            float min = FLT_MAX;
+            float max = -FLT_MAX;
+
+            // find min/max
+#ifdef __ARM_NEON
+            {
+                float32x4_t minv = vdupq_n_f32(FLT_MAX);
+                float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
+
+                for (int l = 0; l < QK; l += 4) {
+                    float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
+                    minv = vminq_f32(minv, v);
+                    maxv = vmaxq_f32(maxv, v);
+                }
+
+                float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
+                float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
+
+                min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
+                max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
+
+                
//printf("SIMD min/max: %f %f\n", min, max); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[j*k + i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + //printf("NORM min/max: %f %f\n", min, max); + } +#endif + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 1.0/d : 0.0; + + memcpy(p0, &min, sizeof(float)); p0 += sizeof(float); + memcpy(p0, &d, sizeof(float)); p0 += sizeof(float); + + //printf("min/max/d/id: %f %f %f %f\n", min, max, d, id); + + for (int s = 0; s < QK/gq_t_bits; ++s) { + memset(pp, 0, sizeof(pp)); + + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[j*k + i*QK + s*gq_t_bits + l]; + const uint8_t q = (v - min)*id; + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } + + for (int b = 0; b < QB; b++) { + memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t); + } + } + } + } +} + +void mul_mat_gq_1( + const void * src0, + const void * src1, + float * dst, + int m, int n, int k) { + const int kp = k & ~(gq_t_bits - 1); + + const char * restrict p0 = src0; + const char * restrict p1 = src1; + + float s0[QB + 1]; + float s1[QB + 1]; + + gq_quant_t m0[QB + 1]; + gq_quant_t m1[QB + 1]; + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + float sumf = 0.0; + + const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK)); + const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK)); + + for (int i = 0; i < kp/QK; i++) { + float min0, d0; + memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float); + memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float); + + float min1, d1; + memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float); + memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float); + + //printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1); + +#if 1 + // >>> General case for any QB + + s0[0] = min0; + s1[0] = min1; + + for (int b = 0; b < QB; b++) { + s0[b + 1] = d0*(1 << b); + s1[b + 1] = d1*(1 << b); + } + + m0[0] = 0-1ULL; + m1[0] = 0-1ULL; + + for (int s = 0; s < QK/gq_t_bits; ++s) { + for (int b = 0; b < QB; b++) { + memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t); + memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t); + } + + for (int q0 = 0; q0 < QB + 1; q0++) { + for (int q1 = 0; q1 < QB + 1; q1++) { + sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]); + } + } + } +#else +#endif + } + + dst[ir0*n + ir1] = sumf; + } + } +} + +// +// method 2 +// n-bit quantization (2nd attempt) +// + +static inline int quantize_2_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_2_quants_per_block(void) { + return QK/gq_t_bits; +} + +static inline int quantize_2_row_size(int k) { + const int nb = quantize_2_blocks_per_row(k); + const int nq = quantize_2_quants_per_block(); + + return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t)); +} + +void quantize_2_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + + const int nb = quantize_2_blocks_per_row(k); + const int nq = quantize_2_quants_per_block(); + + gq_scale_t * restrict pm = (gq_scale_t *) (dst); + gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb); + gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb); + + gq_quant_t pp[QB]; + + static const int32_t sh[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + 
+ for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + +#ifdef __ARM_NEON + { + float32x4_t minv = vdupq_n_f32(FLT_MAX); + float32x4_t maxv = vdupq_n_f32(-FLT_MAX); + + for (int l = 0; l < QK; l += 4) { + float32x4_t v = vld1q_f32(src + i*QK + l); + minv = vminq_f32(minv, v); + maxv = vmaxq_f32(maxv, v); + } + + float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); + float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); + + min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); + max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + } +#endif + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + for (int s = 0; s < nq; ++s) { + memset(pp, 0, sizeof(pp)); + +#if 1 + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[i*QK + s*gq_t_bits + l]; + const uint8_t q = (v - min)*id + frand(); + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } +#elif defined(__ARM_NEON) +#if 1 + { + uint32_t ppt[2*4*QB]; + + float32x4_t minv = vdupq_n_f32(min); + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits % 16 == 0); + + uint32x4_t p0[QB] = { vdupq_n_u32(0) }; + uint32x4_t p1[QB] = { vdupq_n_u32(0) }; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vsubq_f32(v0, minv); + v1 = vsubq_f32(v1, minv); + v2 = vsubq_f32(v2, minv); + v3 = vsubq_f32(v3, minv); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 1 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + for (int b = 0; b < QB; ++b) { + uint32x4_t m = vdupq_n_u32(1 << b); + uint32x4_t r = vdupq_n_u32(-b); + + if (l < 32) { + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12))); + } else { + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20))); + } + } + } + +#if QB == 4 + vst1q_u32((uint32_t *) ppt + 0, p0[0]); + vst1q_u32((uint32_t *) ppt + 4, p1[0]); + vst1q_u32((uint32_t *) ppt + 8, p0[1]); + vst1q_u32((uint32_t *) ppt + 12, p1[1]); + 
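// each 64-bit quant pp[b] is assembled below: the four lanes of p0[b]
+                    // are ORed into the low 32 bits and the four lanes of p1[b] into
+                    // the high 32 bits, via the scratch buffer ppt
+                    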
vst1q_u32((uint32_t *) ppt + 16, p0[2]); + vst1q_u32((uint32_t *) ppt + 20, p1[2]); + vst1q_u32((uint32_t *) ppt + 24, p0[3]); + vst1q_u32((uint32_t *) ppt + 28, p1[3]); + + pp[0] = (ppt[0] | ppt[1] | ppt[2] | ppt[3] ) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7]) ) << 32; + pp[1] = (ppt[8] | ppt[9] | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32; + pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32; + pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32; +#else + for (int b = 0; b < QB; ++b) { + vst1q_u32((uint32_t *) ppt + 0, p0[b]); + vst1q_u32((uint32_t *) ppt + 4, p1[b]); + + pp[b] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32; + } +#endif + } +#else + // less optimal SIMD + { + float32x4_t minv = vdupq_n_f32(min); + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits == 64); + uint8_t qq[gq_t_bits]; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vsubq_f32(v0, minv); + v1 = vsubq_f32(v1, minv); + v2 = vsubq_f32(v2, minv); + v3 = vsubq_f32(v3, minv); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 0 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + // store in qq as uint8_t + vst1_u8(qq + l + 0, vmovn_u16(vcombine_u16(vmovn_u32(q0), vmovn_u32(q1)))); + vst1_u8(qq + l + 8, vmovn_u16(vcombine_u16(vmovn_u32(q2), vmovn_u32(q3)))); + } + + for (int l = 0; l < gq_t_bits; l++) { + for (int b = 0; b < QB; b++) { + const uint64_t ql = qq[l]; + /*pp[b] |= qq[l] & (1 << b) ? 
(1ULL << l) : 0;*/ + pp[b] |= ((ql & (1 << b)) >> b) << l; + } + } + } +#endif +#endif + memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp)); + } + } +} + +// reimplementation of quantize_2 using quantize_2_row +void quantize_2(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_2_row(src + j*k, dst, k); + dst = (char *) dst + quantize_2_row_size(k); + } +} + +void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_2_blocks_per_row(n); + const int nq = quantize_2_quants_per_block(); + + const gq_scale_t * restrict pm0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pm1 = (const gq_scale_t *) y; + + const gq_scale_t * restrict pd0 = pm0 + nb; + const gq_scale_t * restrict pd1 = pm1 + nb; + + const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb); + const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 1 + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + +#if QB == 4 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#define bpcnt(x) __builtin_popcountll(x) + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + isum01 += (1 << 2)*(bpcnt(mm1[2])); + isum01 += (1 << 3)*(bpcnt(mm1[3])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + isum10 += (1 << 2)*(bpcnt(mm0[2])); + isum10 += (1 << 3)*(bpcnt(mm0[3])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0])); + isum11 += (1 << 3)*(bpcnt(mm0[0] & mm1[3]) + bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1]) + bpcnt(mm0[3] & mm1[0])); + isum11 += (1 << 4)*(bpcnt(mm0[1] & mm1[3]) + bpcnt(mm0[2] & mm1[2]) + bpcnt(mm0[3] & mm1[1])); + isum11 += (1 << 5)*(bpcnt(mm0[2] & mm1[3]) + bpcnt(mm0[3] & mm1[2])); + isum11 += (1 << 6)*(bpcnt(mm0[3] & mm1[3])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#elif QB == 3 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#if gq_t_bits == 32 +#define bpcnt(x) __builtin_popcount(x) +#else +#define bpcnt(x) __builtin_popcountll(x) +#endif + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + isum01 += (1 << 2)*(bpcnt(mm1[2])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + isum10 += (1 << 2)*(bpcnt(mm0[2])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0])); + isum11 += (1 << 3)*(bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1])); + isum11 += (1 << 4)*(bpcnt(mm0[2] & mm1[2])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#elif QB == 2 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s 
= 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#if gq_t_bits == 32 +#define bpcnt(x) __builtin_popcount(x) +#else +#define bpcnt(x) __builtin_popcountll(x) +#endif + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[1] & mm1[1])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#else + float s0[QB + 1]; + float s1[QB + 1]; + + s0[0] = m0; + s1[0] = m1; + + for (int b = 0; b < QB; b++) { + s0[b + 1] = d0*(1 << b); + s1[b + 1] = d1*(1 << b); + } + + for (int s = 0; s < nq; ++s) { + for (int q0 = 0; q0 < QB + 1; q0++) { + const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL; + for (int q1 = 0; q1 < QB + 1; q1++) { + const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL; + sumf += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1); + } + } + } +#endif + } +#else +#error "not implemented" +#endif + + *s = sumf; +} + +// use vec_dot_gq_2 to compute the dot product of two rows +void mul_mat_gq_2( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_2(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_2_row_size(k); + } + src0 = (const char *) src0 + quantize_2_row_size(k); + src1 = (const char *) src1 - n*quantize_2_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 3 +// (does not work) +// + +static inline int quantize_3_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_3_quants_per_block(void) { + return QK/gq_t_bits; +} + +static inline int quantize_3_row_size(int k) { + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + return nb*(sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t)); +} + +void quantize_3_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb); + + gq_quant_t pp[QB]; + + static const int32_t sh[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // abs max + +#ifdef __ARM_NEON + { + // min / max + //float32x4_t minv = vdupq_n_f32(FLT_MAX); + //float32x4_t maxv = vdupq_n_f32(-FLT_MAX); + + //for (int l = 0; l < QK; l += 4) { + // float32x4_t v = vld1q_f32(src + i*QK + l); + // minv = vminq_f32(minv, v); + // maxv = vmaxq_f32(maxv, v); + //} + + //float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); + //float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); + + //min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); + //max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); + + // abs max + float32x4_t amaxv = vdupq_n_f32(0.0f); + + for (int l = 0; l < QK; l += 4) { + float32x4_t v = vld1q_f32(src + i*QK + l); + amaxv = vmaxq_f32(amaxv, vabsq_f32(v)); + } + + float32x2_t amaxv32 = 
vpmax_f32(vget_low_f32(amaxv), vget_high_f32(amaxv)); + + amax = MAX(vget_lane_f32(amaxv32, 0), vget_lane_f32(amaxv32, 1)); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + } +#endif + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int s = 0; s < nq; ++s) { + memset(pp, 0, sizeof(pp)); + +#if 0 + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[i*QK + s*gq_t_bits + l]; + const uint8_t q = v*id + frand(); + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } +#elif defined(__ARM_NEON) + { + uint32_t ppt[2*4*QB]; + + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits == 64); + + uint32x4_t p0[QB] = { vdupq_n_u32(0) }; + uint32x4_t p1[QB] = { vdupq_n_u32(0) }; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 1 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + for (int b = 0; b < QB; ++b) { + uint32x4_t m = vdupq_n_u32(1 << b); + int32x4_t r = vdupq_n_s32(-b); + + if (l < 32) { + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12))); + } else { + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20))); + } + } + } + +#if QB == 4 + vst1q_u32((uint32_t *) ppt + 0, p0[0]); + vst1q_u32((uint32_t *) ppt + 4, p1[0]); + vst1q_u32((uint32_t *) ppt + 8, p0[1]); + vst1q_u32((uint32_t *) ppt + 12, p1[1]); + vst1q_u32((uint32_t *) ppt + 16, p0[2]); + vst1q_u32((uint32_t *) ppt + 20, p1[2]); + vst1q_u32((uint32_t *) ppt + 24, p0[3]); + vst1q_u32((uint32_t *) ppt + 28, p1[3]); + + pp[0] = (ppt[0] | ppt[1] | ppt[2] | ppt[3] ) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7]) ) << 32; + pp[1] = (ppt[8] | ppt[9] | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32; + pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32; + pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32; +#else + for (int q = 0; q < QB; ++q) { + vst1q_u32((uint32_t *) ppt + 0, p0[q]); + vst1q_u32((uint32_t *) ppt + 4, 
p1[q]); + + pp[q] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32; + } +#endif + } +#endif + memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp)); + } + } +} + +// reimplementation of quantize_3 using quantize_3_row +void quantize_3(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_3_row(src + j*k, dst, k); + dst = (char *) dst + quantize_3_row_size(k); + } +} + +void vec_dot_gq_3(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + float sumf = 0.0f; + + const int nb = quantize_3_blocks_per_row(n); + const int nq = quantize_3_quants_per_block(); + + const gq_scale_t * restrict pd0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pd1 = (const gq_scale_t *) y; + + const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb); + const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb); + +#if 1 + for (int i = 0; i < nb; i++) { + int isum = 0; + +#if QB == 4 + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict m0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict m1 = pb1 + i*nq*QB + s*QB; + + isum += (1 << 0)*(__builtin_popcountll(m0[0] & m1[0])); + isum += (1 << 1)*(__builtin_popcountll(m0[0] & m1[1]) + __builtin_popcountll(m0[1] & m1[0])); + isum += (1 << 2)*(__builtin_popcountll(m0[0] & m1[2]) + __builtin_popcountll(m0[1] & m1[1]) + __builtin_popcountll(m0[2] & m1[0])); + isum += (1 << 3)*(__builtin_popcountll(m0[0] & m1[3]) + __builtin_popcountll(m0[1] & m1[2]) + __builtin_popcountll(m0[2] & m1[1]) + __builtin_popcountll(m0[3] & m1[0])); + isum += (1 << 4)*(__builtin_popcountll(m0[1] & m1[3]) + __builtin_popcountll(m0[2] & m1[2]) + __builtin_popcountll(m0[3] & m1[1])); + isum += (1 << 5)*(__builtin_popcountll(m0[2] & m1[3]) + __builtin_popcountll(m0[3] & m1[2])); + isum += (1 << 6)*(__builtin_popcountll(m0[3] & m1[3])); + } +#else + for (int s = 0; s < nq; ++s) { + for (int q0 = 0; q0 < QB; q0++) { + const gq_quant_t mm0 = pb0[i*nq*QB + s*QB + q0]; + for (int q1 = 0; q1 < QB; q1++) { + const gq_quant_t mm1 = pb1[i*nq*QB + s*QB + q1]; + isum += (1 << (q0 + q1))*(__builtin_popcountll(mm0 & mm1)); + } + } + } +#endif + + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + sumf += d0*d1*isum; + } +#else +#ifdef __ARM_NEON + // gq_quant_t == uint64_t + for (int i = 0; i < nb; i += 4) { + int isum[4] = {0, 0, 0, 0}; + + for (int k = 0; k < 4; ++k) { + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict m0 = pb0 + (i+k)*nq*QB + s*QB; + const gq_quant_t * restrict m1 = pb1 + (i+k)*nq*QB + s*QB; + +#if QB == 4 +#define bpcnt(x) __builtin_popcountll(x) + //isum[k] += (1ULL << 0)*(bpcnt(m0[0] & m1[0])) + + // (1ULL << 1)*(bpcnt(m0[0] & m1[1]) + bpcnt(m0[1] & m1[0])) + + // (1ULL << 2)*(bpcnt(m0[0] & m1[2]) + bpcnt(m0[1] & m1[1]) + bpcnt(m0[2] & m1[0])) + + // (1ULL << 3)*(bpcnt(m0[0] & m1[3]) + bpcnt(m0[1] & m1[2]) + bpcnt(m0[2] & m1[1]) + bpcnt(m0[3] & m1[0])) + + // (1ULL << 4)*(bpcnt(m0[1] & m1[3]) + bpcnt(m0[2] & m1[2]) + bpcnt(m0[3] & m1[1])) + + // (1ULL << 5)*(bpcnt(m0[2] & m1[3]) + bpcnt(m0[3] & m1[2])) + + // (1ULL << 6)*(bpcnt(m0[3] & m1[3])); +#undef bpcnt + + const uint8x8_t m00 = vld1_u8((const uint8_t *) (m0 + 0)); + const uint8x8_t m01 = vld1_u8((const uint8_t *) (m0 + 1)); + const uint8x8_t m02 = vld1_u8((const uint8_t *) (m0 + 2)); + const uint8x8_t m03 = vld1_u8((const uint8_t *) (m0 + 3)); + + const uint8x8_t m10 = vld1_u8((const uint8_t *) (m1 + 
0)); + const uint8x8_t m11 = vld1_u8((const uint8_t *) (m1 + 1)); + const uint8x8_t m12 = vld1_u8((const uint8_t *) (m1 + 2)); + const uint8x8_t m13 = vld1_u8((const uint8_t *) (m1 + 3)); + + const uint8x8_t m00m10 = vand_u8(m00, m10); + + const uint8x8_t m00m11 = vand_u8(m00, m11); + const uint8x8_t m01m10 = vand_u8(m01, m10); + + const uint8x8_t m00m12 = vand_u8(m00, m12); + const uint8x8_t m01m11 = vand_u8(m01, m11); + const uint8x8_t m02m10 = vand_u8(m02, m10); + + const uint8x8_t m00m13 = vand_u8(m00, m13); + const uint8x8_t m01m12 = vand_u8(m01, m12); + const uint8x8_t m02m11 = vand_u8(m02, m11); + const uint8x8_t m03m10 = vand_u8(m03, m10); + + const uint8x8_t m01m13 = vand_u8(m01, m13); + const uint8x8_t m02m12 = vand_u8(m02, m12); + const uint8x8_t m03m11 = vand_u8(m03, m11); + + const uint8x8_t m02m13 = vand_u8(m02, m13); + const uint8x8_t m03m12 = vand_u8(m03, m12); + + const uint8x8_t m03m13 = vand_u8(m03, m13); + +#define bpcnt(x) vaddv_u8(vcnt_u8(x)) + isum[k] += (1ULL << 0)*(bpcnt(m00m10)) + + (1ULL << 1)*(bpcnt(m00m11) + bpcnt(m01m10)) + + (1ULL << 2)*(bpcnt(m00m12) + bpcnt(m01m11) + bpcnt(m02m10)) + + (1ULL << 3)*(bpcnt(m00m13) + bpcnt(m01m12) + bpcnt(m02m11) + bpcnt(m03m10)) + + (1ULL << 4)*(bpcnt(m01m13) + bpcnt(m02m12) + bpcnt(m03m11)) + + (1ULL << 5)*(bpcnt(m02m13) + bpcnt(m03m12)) + + (1ULL << 6)*(bpcnt(m03m13)); +#undef bpcnt +#else + for (int q0 = 0; q0 < QB; q0++) { + const gq_quant_t mm0 = m0[q0]; + for (int q1 = 0; q1 < QB; q1++) { + const gq_quant_t mm1 = m1[q1]; + isum[k] += (1ULL << (q0 + q1))*(__builtin_popcountll(mm0 & mm1)); + } + } +#endif + } + } + + int32x4_t isumv = vld1q_s32(isum); + + float32x4_t d0v = vld1q_f32(pd0 + i); + float32x4_t d1v = vld1q_f32(pd1 + i); + + float32x4_t sumfv = vmulq_f32(d0v, d1v); + + sumfv = vmulq_f32(sumfv, vcvtq_f32_s32(isumv)); + sumf += vaddvq_f32(sumfv); + } +#else +#error "not implemented" +#endif + +#endif + *s = sumf; +} + +// use vec_dot_gq_3 to compute the dot product of two rows +void mul_mat_gq_3( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_3(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_3_row_size(k); + } + src0 = (const char *) src0 + quantize_3_row_size(k); + src1 = (const char *) src1 - n*quantize_3_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 4 +// 4-bit quantization +// + +static inline int quantize_4_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_4_row_size(int k) { + const int nb = quantize_4_blocks_per_row(k); + + return nb*(2*sizeof(gq_scale_t) + QK/2); +} + +void quantize_4_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + assert(QB == 4); + + const int nb = quantize_4_blocks_per_row(k); + + gq_scale_t * restrict pm = (gq_scale_t *) (dst); + gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float min = FLT_MAX; + float max = -FLT_MAX; + +#if defined(__AVX2__) + { + assert(QK == 64); + enum { QK8 = QK/8 }; + + __m256 srcv[QK8]; + __m256 minv[QK8]; + __m256 maxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*QK + 8*l); + } + + for (int l = 0; l < QK8/2; l++) { + minv[2*l] = 
_mm256_min_ps(srcv[2*l], srcv[2*l+1]); + maxv[2*l] = _mm256_max_ps(srcv[2*l], srcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + minv[4*l] = _mm256_min_ps(minv[4*l], minv[4*l+2]); + maxv[4*l] = _mm256_max_ps(maxv[4*l], maxv[4*l+2]); + } + + for (int l = 0; l < QK8/8; l++) { + minv[8*l] = _mm256_min_ps(minv[8*l], minv[8*l+4]); + maxv[8*l] = _mm256_max_ps(maxv[8*l], maxv[8*l+4]); + } + + //min = MIN(minv[0][0], MIN(minv[0][1], MIN(minv[0][2], MIN(minv[0][3], MIN(minv[0][4], MIN(minv[0][5], MIN(minv[0][6], minv[0][7]))))))); + //max = MAX(maxv[0][0], MAX(maxv[0][1], MAX(maxv[0][2], MAX(maxv[0][3], MAX(maxv[0][4], MAX(maxv[0][5], MAX(maxv[0][6], maxv[0][7]))))))); + + const __m256 minv0_0 = _mm256_permute2f128_ps(minv[0], minv[0], 3); + const __m256 minv0_1 = _mm256_min_ps(minv[0], minv0_0); + const __m256 minv0_2 = _mm256_permute_ps(minv0_1, 0x4e); + const __m256 minv0_3 = _mm256_min_ps(minv0_1, minv0_2); + const __m256 minv0_4 = _mm256_permute_ps(minv0_3, 0xb1); + const __m256 minv0_5 = _mm256_min_ps(minv0_3, minv0_4); + + const __m256 maxv0_0 = _mm256_permute2f128_ps(maxv[0], maxv[0], 3); + const __m256 maxv0_1 = _mm256_max_ps(maxv[0], maxv0_0); + const __m256 maxv0_2 = _mm256_permute_ps(maxv0_1, 0x4e); + const __m256 maxv0_3 = _mm256_max_ps(maxv0_1, maxv0_2); + const __m256 maxv0_4 = _mm256_permute_ps(maxv0_3, 0xb1); + const __m256 maxv0_5 = _mm256_max_ps(maxv0_3, maxv0_4); + + min = _mm256_cvtss_f32(minv0_5); + max = _mm256_cvtss_f32(maxv0_5); + + const float d = (max - min) / ((1 << QB) - 2); + const float id = d ? 1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < QK/8; l++) { + __m256 v = _mm256_mul_ps(_mm256_sub_ps(srcv[l], _mm256_set1_ps(min)), idv); +#if 0 + v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand(); + v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand(); +#endif + + // convert to uint8 + __m256i vi = _mm256_cvtps_epi32(v); + + uint32_t vi_0 = _mm256_extract_epi32(vi, 0); + uint32_t vi_1 = _mm256_extract_epi32(vi, 1); + uint32_t vi_2 = _mm256_extract_epi32(vi, 2); + uint32_t vi_3 = _mm256_extract_epi32(vi, 3); + + uint32_t vi_4 = _mm256_extract_epi32(vi, 4); + uint32_t vi_5 = _mm256_extract_epi32(vi, 5); + uint32_t vi_6 = _mm256_extract_epi32(vi, 6); + uint32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + //printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) && 0 + { + // TODO + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 
1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < QK; l++) { + const float v = (src[i*QK + l] - min) * id; + const uint8_t vi = (uint8_t) (v + frand()); + pp[l/2] |= (vi & 0xf) << (4*(l & 1)); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#endif + //printf("min %f max %f\n", min, max); + } +} + +// reimplementation of quantize_4 using quantize_4_row +void quantize_4(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_4_row(src + j*k, dst, k); + dst = (char *) dst + quantize_4_row_size(k); + } +} + +void vec_dot_gq_4(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_4_blocks_per_row(n); + + const gq_scale_t * restrict pm0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pm1 = (const gq_scale_t *) y; + + const gq_scale_t * restrict pd0 = pm0 + nb; + const gq_scale_t * restrict pd1 = pm1 + nb; + + const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); + const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 0 + // scalar + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + for (int j = 0; j < QK/2; j++) { + const uint8_t v0 = p0[j]; + const uint8_t v1 = p1[j]; + + const float f0 = d0*(v0 & 0xf) + m0; + const float f1 = d0*(v0 >> 4) + m0; + + const float f2 = d1*(v1 & 0xf) + m1; + const float f3 = d1*(v1 >> 4) + m1; + + sumf += f0*f2 + f1*f3; + } + } +#else +#if defined(__AVX2__) +#if QK == 64 && 0 + __m256 sumv0 = _mm256_setzero_ps(); + __m256 sumv1 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 m0v = _mm256_set1_ps(m0); + const __m256 d0v = _mm256_set1_ps(d0); + + const __m256 m1v = _mm256_set1_ps(m1); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + + //_mm_prefetch((const char *) (p0 + 32), _MM_HINT_T0); + //_mm_prefetch((const char *) (p1 + 32), _MM_HINT_T0); + //_mm_prefetch((const char *) (pm0 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pm1 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pd0 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pd1 + i + 1), _MM_HINT_T0); + + __m256i v00 = _mm256_and_si256(v0, _mm256_set1_epi32(0x000000FF)); + __m256i v01 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x0000FFFF)), 8); + __m256i v02 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x00FFFFFF)), 16); + __m256i v03 = _mm256_srli_epi32(v0, 24); + + ////////////////////// + + //{ + // uint32_t vi_0 = _mm256_extract_epi32(v00, 0); + // uint32_t vi_1 = _mm256_extract_epi32(v00, 1); + // uint32_t vi_2 = _mm256_extract_epi32(v00, 2); + // uint32_t vi_3 = _mm256_extract_epi32(v00, 3); + // uint32_t vi_4 = _mm256_extract_epi32(v00, 4); + // uint32_t vi_5 = _mm256_extract_epi32(v00, 5); + // uint32_t vi_6 = _mm256_extract_epi32(v00, 6); + // uint32_t vi_7 = 
_mm256_extract_epi32(v00, 7); + // printf("v0: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + // printf("p0: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[0], p0[4], p0[8], p0[12], p0[16], p0[20], p0[24], p0[28]); + // printf("p1: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[1], p0[5], p0[9], p0[13], p0[17], p0[21], p0[25], p0[29]); + // printf("p2: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[2], p0[6], p0[10], p0[14], p0[18], p0[22], p0[26], p0[30]); + // printf("p3: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[3], p0[7], p0[11], p0[15], p0[19], p0[23], p0[27], p0[31]); + //} + + // compute 32 x 4-bit values (low and high) + __m256i v00l = _mm256_and_si256(v00, m4b); + __m256i v01l = _mm256_and_si256(v01, m4b); + __m256i v02l = _mm256_and_si256(v02, m4b); + __m256i v03l = _mm256_and_si256(v03, m4b); + + __m256i v00h = _mm256_srli_epi32(v00, 4); + __m256i v01h = _mm256_srli_epi32(v01, 4); + __m256i v02h = _mm256_srli_epi32(v02, 4); + __m256i v03h = _mm256_srli_epi32(v03, 4); + + //{ + // uint32_t vi_0 = _mm256_extract_epi32(v00l, 0); + // uint32_t vi_1 = _mm256_extract_epi32(v00l, 1); + // uint32_t vi_2 = _mm256_extract_epi32(v00l, 2); + // uint32_t vi_3 = _mm256_extract_epi32(v00l, 3); + // uint32_t vi_4 = _mm256_extract_epi32(v00l, 4); + // uint32_t vi_5 = _mm256_extract_epi32(v00l, 5); + // uint32_t vi_6 = _mm256_extract_epi32(v00l, 6); + // uint32_t vi_7 = _mm256_extract_epi32(v00l, 7); + + // printf("v0l: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + + // vi_0 = _mm256_extract_epi32(v00h, 0); + // vi_1 = _mm256_extract_epi32(v00h, 1); + // vi_2 = _mm256_extract_epi32(v00h, 2); + // vi_3 = _mm256_extract_epi32(v00h, 3); + // vi_4 = _mm256_extract_epi32(v00h, 4); + // vi_5 = _mm256_extract_epi32(v00h, 5); + // vi_6 = _mm256_extract_epi32(v00h, 6); + // vi_7 = _mm256_extract_epi32(v00h, 7); + + // printf("v0h: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + //} + + // convert to float + __m256 vf00l = _mm256_cvtepi32_ps(v00l); + __m256 vf01l = _mm256_cvtepi32_ps(v01l); + __m256 vf02l = _mm256_cvtepi32_ps(v02l); + __m256 vf03l = _mm256_cvtepi32_ps(v03l); + + __m256 vf00h = _mm256_cvtepi32_ps(v00h); + __m256 vf01h = _mm256_cvtepi32_ps(v01h); + __m256 vf02h = _mm256_cvtepi32_ps(v02h); + __m256 vf03h = _mm256_cvtepi32_ps(v03h); + + //{ + // printf("vf00l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf00l[0], vf00l[1], vf00l[2], vf00l[3], vf00l[4], vf00l[5], vf00l[6], vf00l[7]); + // printf("vf01l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf01l[0], vf01l[1], vf01l[2], vf01l[3], vf01l[4], vf01l[5], vf01l[6], vf01l[7]); + // printf("vf02l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf02l[0], vf02l[1], vf02l[2], vf02l[3], vf02l[4], vf02l[5], vf02l[6], vf02l[7]); + // printf("vf03l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf03l[0], vf03l[1], vf03l[2], vf03l[3], vf03l[4], vf03l[5], vf03l[6], vf03l[7]); + //} + + // multiply by scale and add offset + vf00l = _mm256_fmadd_ps(vf00l, d0v, m0v); + vf01l = _mm256_fmadd_ps(vf01l, d0v, m0v); + vf02l = _mm256_fmadd_ps(vf02l, d0v, m0v); + vf03l = _mm256_fmadd_ps(vf03l, d0v, m0v); + + vf00h = _mm256_fmadd_ps(vf00h, d0v, m0v); + vf01h = _mm256_fmadd_ps(vf01h, d0v, m0v); + vf02h = _mm256_fmadd_ps(vf02h, d0v, m0v); + vf03h = _mm256_fmadd_ps(vf03h, d0v, m0v); + + __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + __m256i v10 = _mm256_and_si256(v1, _mm256_set1_epi32(0x000000FF)); + __m256i v11 = 
_mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x0000FFFF)), 8); + __m256i v12 = _mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x00FFFFFF)), 16); + __m256i v13 = _mm256_srli_epi32(v1, 24); + + __m256i v10l = _mm256_and_si256(v10, m4b); + __m256i v11l = _mm256_and_si256(v11, m4b); + __m256i v12l = _mm256_and_si256(v12, m4b); + __m256i v13l = _mm256_and_si256(v13, m4b); + + __m256i v10h = _mm256_srli_epi32(v10, 4); + __m256i v11h = _mm256_srli_epi32(v11, 4); + __m256i v12h = _mm256_srli_epi32(v12, 4); + __m256i v13h = _mm256_srli_epi32(v13, 4); + + __m256 vf10l = _mm256_cvtepi32_ps(v10l); + __m256 vf11l = _mm256_cvtepi32_ps(v11l); + __m256 vf12l = _mm256_cvtepi32_ps(v12l); + __m256 vf13l = _mm256_cvtepi32_ps(v13l); + + __m256 vf10h = _mm256_cvtepi32_ps(v10h); + __m256 vf11h = _mm256_cvtepi32_ps(v11h); + __m256 vf12h = _mm256_cvtepi32_ps(v12h); + __m256 vf13h = _mm256_cvtepi32_ps(v13h); + + vf10l = _mm256_fmadd_ps(vf10l, d1v, m1v); + vf11l = _mm256_fmadd_ps(vf11l, d1v, m1v); + vf12l = _mm256_fmadd_ps(vf12l, d1v, m1v); + vf13l = _mm256_fmadd_ps(vf13l, d1v, m1v); + + vf10h = _mm256_fmadd_ps(vf10h, d1v, m1v); + vf11h = _mm256_fmadd_ps(vf11h, d1v, m1v); + vf12h = _mm256_fmadd_ps(vf12h, d1v, m1v); + vf13h = _mm256_fmadd_ps(vf13h, d1v, m1v); + + // compute dot product + sumv0 = _mm256_fmadd_ps(vf00l, vf10l, sumv0); + sumv0 = _mm256_fmadd_ps(vf01l, vf11l, sumv0); + sumv0 = _mm256_fmadd_ps(vf02l, vf12l, sumv0); + sumv0 = _mm256_fmadd_ps(vf03l, vf13l, sumv0); + + sumv1 = _mm256_fmadd_ps(vf00h, vf10h, sumv1); + sumv1 = _mm256_fmadd_ps(vf01h, vf11h, sumv1); + sumv1 = _mm256_fmadd_ps(vf02h, vf12h, sumv1); + sumv1 = _mm256_fmadd_ps(vf03h, vf13h, sumv1); + } + + // accumulate (horizontal sum) + const __m256 vdot = _mm256_add_ps(sumv0, sumv1); + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(vdot), _mm256_extractf128_ps(vdot, 1)); + const __m128 t1 = _mm_hadd_ps(t0, t0); + + sumf += _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); +#elif QK == 64 && 0 + float sum00 = 0.0f; + float sum01 = 0.0f; + float sum10 = 0.0f; + float sum11 = 0.0f; + + const __m256i m4b = _mm256_set1_epi8(0xf); + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + const __m256i v0l = _mm256_and_si256(v0, m4b); + const __m256i v1l = _mm256_and_si256(v1, m4b); + + const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + const __m256i pl = _mm256_maddubs_epi16(v0l, v1l); + const __m256i ph = _mm256_maddubs_epi16(v0h, v1h); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum00 += m0*m1; + sum01 += m1*d0*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v0l, v0h))); + sum10 += m0*d1*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v1l, v1h))); + sum11 += d0*d1*(_mm256_hadd_epi32_gg(p)); + } + + sumf = 64.0*sum00 + sum01 + sum10 + sum11; +#elif QK == 64 && 1 // this is the best when using min + d + float sum00 = 0.0f; + + __m256 sum01 = _mm256_setzero_ps(); + __m256 sum10 = _mm256_setzero_ps(); + __m256 sum11 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float m0 = 
GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 m0v = _mm256_set1_ps(m0); + const __m256 d0v = _mm256_set1_ps(d0); + + const __m256 m1v = _mm256_set1_ps(m1); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256 m1d0v = _mm256_mul_ps(m1v, d0v); + const __m256 m0d1v = _mm256_mul_ps(m0v, d1v); + const __m256 d0d1v = _mm256_mul_ps(d0v, d1v); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + const __m256i v0l = _mm256_and_si256(v0, m4b); + const __m256i v1l = _mm256_and_si256(v1, m4b); + + const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + const __m256i v0a = _mm256_add_epi8(v0l, v0h); + const __m256i v1a = _mm256_add_epi8(v1l, v1h); + + const __m128i v0al = _mm256_extracti128_si256(v0a, 0); + const __m128i v0ah = _mm256_extracti128_si256(v0a, 1); + + const __m128i v1al = _mm256_extracti128_si256(v1a, 0); + const __m128i v1ah = _mm256_extracti128_si256(v1a, 1); + + const __m128i v0as = _mm_add_epi8(v0al, v0ah); + const __m128i v1as = _mm_add_epi8(v1al, v1ah); + + const __m256i v0as_0 = _mm256_cvtepu8_epi32(v0as); + const __m256i v0as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v0as, 8)); + + const __m256i v1as_0 = _mm256_cvtepu8_epi32(v1as); + const __m256i v1as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v1as, 8)); + + const __m256i v0ass = _mm256_add_epi32(v0as_0, v0as_1); + const __m256i v1ass = _mm256_add_epi32(v1as_0, v1as_1); + + const __m256 v0f = _mm256_cvtepi32_ps(v0ass); + const __m256 v1f = _mm256_cvtepi32_ps(v1ass); + + const __m256i pl = _mm256_maddubs_epi16(v0l, v1l); + const __m256i ph = _mm256_maddubs_epi16(v0h, v1h); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum00 += m0*m1; + sum01 = _mm256_fmadd_ps(m1d0v, v0f, sum01); + sum10 = _mm256_fmadd_ps(m0d1v, v1f, sum10); + sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11); + } + + sumf = 64.0*sum00 + _mm256_hadd_ps_gg(sum01) + _mm256_hadd_ps_gg(sum10) + _mm256_hadd_ps_gg(sum11); +#endif +#elif defined (__ARM_NEON) + float sum00 = 0.0f; + float sum01 = 0.0f; + float sum10 = 0.0f; + float sum11 = 0.0f; + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // and with 0xf + const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); + const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); + const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); + const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); + + const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); + const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); + const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); + const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); + + // dot product into uint16x8_t + const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), 
vget_low_u8 (v1_0l)); + const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); + const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); + const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); + + const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); + const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); + const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); + const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); + + const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h); + const uint16x8_t pl1 = vaddq_u16(pl1l, pl1h); + const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h); + const uint16x8_t ph1 = vaddq_u16(ph1l, ph1h); + + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t ph = vaddq_u16(ph0, ph1); + + sum00 += m0*m1; + sum01 += m1*d0*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h) + vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); + sum10 += m0*d1*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h) + vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + //sum11 += d0*d1*( + // vaddvq_u16(vaddq_u16(vaddq_u16(pl0l, pl0h), vaddq_u16(pl1l, pl1h))) + + // vaddvq_u16(vaddq_u16(vaddq_u16(ph0l, ph0h), vaddq_u16(ph1l, ph1h)))); + sum11 += d0*d1*vaddvq_u16(vaddq_u16(pl, ph)); + } + + sumf = 64.0*sum00 + sum01 + sum10 + sum11; +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_4 to compute the dot product of two rows +void mul_mat_gq_4( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_4_blocks_per_row(k); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_4(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_4_row_size(k); + } + src0 = (const char *) src0 + quantize_4_row_size(k); + src1 = (const char *) src1 - n*quantize_4_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 5 +// 4-bit quantization (without min, only delta) +// + +static inline int quantize_5_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_5_row_size(int k) { + const int nb = quantize_5_blocks_per_row(k); + + return nb*(sizeof(gq_scale_t) + QK/2); +} + +void quantize_5_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + assert(QB == 4); + + const int nb = quantize_5_blocks_per_row(k); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float amax = 0.0f; // absolute max + +#if defined(__AVX2__) + { + assert(QK == 64); + enum { QK8 = QK/8 }; + + __m256 srcv [QK8]; + __m256 asrcv[QK8]; + __m256 amaxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*QK + 8*l); + } + + for (int l = 0; l < QK8; l++) { + asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); + } + + + for (int l = 0; l < QK8/2; l++) { + amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]); + } + + for (int l = 0; l < QK8/8; l++) { + amaxv[8*l] = _mm256_max_ps(amaxv[8*l], amaxv[8*l+4]); + } + + //amax = MAX(amaxv[0][0], MAX(amaxv[0][1], MAX(amaxv[0][2], MAX(amaxv[0][3], MAX(amaxv[0][4], MAX(amaxv[0][5], MAX(amaxv[0][6], amaxv[0][7]))))))); + + const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3); + const __m256 amaxv0_1 = 
_mm256_max_ps(amaxv[0], amaxv0_0); + const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e); + const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2); + const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1); + const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4); + + amax = _mm256_cvtss_f32(amaxv0_5); + + //printf("amax = %f\n", amax); + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < QK/8; l++) { + __m256 v = _mm256_mul_ps(srcv[l], idv); +#if 0 + v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand(); + v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand(); +#endif + + // convert to int8 + __m256i vi = _mm256_cvtps_epi32(v); + vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8)); + + int32_t vi_0 = _mm256_extract_epi32(vi, 0); + int32_t vi_1 = _mm256_extract_epi32(vi, 1); + int32_t vi_2 = _mm256_extract_epi32(vi, 2); + int32_t vi_3 = _mm256_extract_epi32(vi, 3); + + int32_t vi_4 = _mm256_extract_epi32(vi, 4); + int32_t vi_5 = _mm256_extract_epi32(vi, 5); + int32_t vi_6 = _mm256_extract_epi32(vi, 6); + int32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + ////printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + + assert(vi_0 >= 0 && vi_0 < 16); + assert(vi_1 >= 0 && vi_1 < 16); + assert(vi_2 >= 0 && vi_2 < 16); + assert(vi_3 >= 0 && vi_3 < 16); + + assert(vi_4 >= 0 && vi_4 < 16); + assert(vi_5 >= 0 && vi_5 < 16); + assert(vi_6 >= 0 && vi_6 < 16); + assert(vi_7 >= 0 && vi_7 < 16); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) && 0 + { + // TODO + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]*id; + const int8_t vi = ((int8_t) (round(v))) + 8; + assert(vi >= 0 && vi < 16); + pp[l/2] |= (vi & 0xf) << (4*(l & 1)); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#endif + //printf("min %f max %f\n", min, max); + } +} + +// reimplementation of quantize_5 using quantize_5_row +void quantize_5(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_5_row(src + j*k, dst, k); + dst = (char *) dst + quantize_5_row_size(k); + } +} + +void vec_dot_gq_5(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_5_blocks_per_row(n); + + const gq_scale_t * restrict pd0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pd1 = (const gq_scale_t *) y; + + const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); + const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 0 + // scalar + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + for (int j = 0; j < QK/2; j++) { + const uint8_t v0 = p0[j]; + const uint8_t v1 = p1[j]; + + const float f0 = d0*((int8_t) (v0 & 0xf) - 8); + const float f1 = d0*((int8_t) (v0 >> 4) - 8); + + const float f2 = d1*((int8_t) (v1 & 0xf) - 8); + const float f3 = d1*((int8_t) (v1 >> 4) - 8); + + sumf += f0*f2 + f1*f3; + } + } +#else +#if defined(__AVX2__) +#if QK == 64 && 1 + __m256 sum11 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 d0v = _mm256_set1_ps(d0); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256 d0d1v = _mm256_mul_ps(d0v, d1v); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + __m256i v0l = _mm256_and_si256(v0, m4b); + __m256i v1l = _mm256_and_si256(v1, m4b); + + __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + // sub 8 + v0l = _mm256_sub_epi8(v0l, _mm256_set1_epi8(8)); + v0h = _mm256_sub_epi8(v0h, _mm256_set1_epi8(8)); + + v1l = _mm256_sub_epi8(v1l, _mm256_set1_epi8(8)); + v1h = _mm256_sub_epi8(v1h, _mm256_set1_epi8(8)); + + // abs + const __m256i v0la = _mm256_sign_epi8(v0l, v0l); + const __m256i v0ha = _mm256_sign_epi8(v0h, v0h); + + // sign + const __m256i v1ls = _mm256_sign_epi8(v1l, v0l); + const __m256i v1hs = _mm256_sign_epi8(v1h, v0h); + + const __m256i pl = _mm256_maddubs_epi16(v0la, v1ls); + const __m256i ph = _mm256_maddubs_epi16(v0ha, v1hs); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11); + } + + sumf = _mm256_hadd_ps_gg(sum11); +#endif +#elif defined (__ARM_NEON) + float sum11 = 0.0f; + + //float32x4_t sum_0 = vdupq_n_f32(0.0f); + //float32x4_t sum_1 = vdupq_n_f32(0.0f); + + //float16x8_t sum_0 = vdupq_n_f16(0.0f); + //float16x8_t sum_1 = vdupq_n_f16(0.0f); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + 
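// method 5 blocks carry only a per-block scale d (no minimum), so the
+        // integer dot product computed below is scaled once by d0*d1 per block
+        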
const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + //float32x4_t d0d1v = vdupq_n_f32(d0*d1); + //float16x8_t d0d1v = vdupq_n_f16(d0*d1); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); + const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); + const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); + + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); + const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); + const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); + + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); + const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); + + // dot product into int16x8_t + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); + + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + + const int16x8_t pl0 = vaddq_s16(pl0l, pl0h); + const int16x8_t pl1 = vaddq_s16(pl1l, pl1h); + const int16x8_t ph0 = vaddq_s16(ph0l, ph0h); + const int16x8_t ph1 = vaddq_s16(ph1l, ph1h); + + const int16x8_t pl = vaddq_s16(pl0, pl1); + const int16x8_t ph = vaddq_s16(ph0, ph1); + + //const int8x16_t pl0 = vmulq_s8(v0_0ls, v1_0ls); + //const int8x16_t pl1 = vmulq_s8(v0_1ls, v1_1ls); + //const int8x16_t ph0 = vmulq_s8(v0_0hs, v1_0hs); + //const int8x16_t ph1 = vmulq_s8(v0_1hs, v1_1hs); + + //const int16x8_t pll = vaddl_s8(vget_low_s8(pl0), vget_low_s8(pl1)); + //const int16x8_t plh = vaddl_s8(vget_high_s8(pl0), vget_high_s8(pl1)); + //const int16x8_t phl = vaddl_s8(vget_low_s8(ph0), vget_low_s8(ph1)); + //const int16x8_t phh = vaddl_s8(vget_high_s8(ph0), vget_high_s8(ph1)); + + //const int16x8_t pl = vaddq_s16(pll, plh); + //const int16x8_t ph = vaddq_s16(phl, phh); + + const int16x8_t p = vaddq_s16(pl, ph); + + // convert to float + //const float32x4_t pf0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (p))); + //const float32x4_t pf1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(p))); + + // scalar + sum11 += d0*d1*vaddvq_s16(p); + //sum11 += d0*d1*(vaddvq_s16(pl) + vaddvq_s16(ph)); + //sum11 += d0*d1*vaddvq_s16(vaddq_s16(pl, ph)); + //sum11 += d0*d1*(vaddvq_s8(pl0) + vaddvq_s8(pl1) + vaddvq_s8(ph0) + vaddvq_s8(ph1)); + //sum11 += d0*d1*(vaddvq_s16(pll) + vaddvq_s16(plh) + vaddvq_s16(phl) + vaddvq_s16(phh)); + + 
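+        // (note) the int16 accumulation above cannot overflow: each lane of p is a sum of
+        // 8 products of values in [-8, 7], so its magnitude is at most 8*64 = 512, far
+        // below the int16 limit of 32767
+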
//sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(p)); + //sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(pl)); + //sum_1 = vfmaq_f16(sum_1, d0d1v, vcvtq_f16_s16(ph)); + + // vectorize + //sum_0 = vmlaq_f32(sum_0, d0d1v, pf0); + //sum_1 = vmlaq_f32(sum_1, d0d1v, pf1); + } + + sumf = sum11; + //sumf = vaddvq_f32(sum_0) + vaddvq_f32(sum_1); + //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7]; + //sum_0 = vaddq_f16(sum_0, sum_1); + //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7]; +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_5 to compute the dot product of two rows +void mul_mat_gq_5( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_5_blocks_per_row(k); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_5(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_5_row_size(k); + } + src0 = (const char *) src0 + quantize_5_row_size(k); + src1 = (const char *) src1 - n*quantize_5_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 6 +// same as 5 but with 32 element blocks +// + +static inline int quantize_6_blocks_per_row(int k) { + return k/32; +} + +static inline int quantize_6_row_size(int k) { + const int nb = quantize_6_blocks_per_row(k); + + return nb*(sizeof(gq_scale_t) + 16); +} + +void quantize_6_row(const float * restrict src, void * restrict dst, int k) { + assert(k % 32 == 0); + assert(QB == 4); + + const int nb = quantize_6_blocks_per_row(k); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[16]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float amax = 0.0f; // absolute max + +#if defined(__AVX2__) + { + enum { QK8 = 4 }; + + __m256 srcv [QK8]; + __m256 asrcv[QK8]; + __m256 amaxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*32 + 8*l); + } + + for (int l = 0; l < QK8; l++) { + asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); + } + + for (int l = 0; l < QK8/2; l++) { + amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]); + } + + const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3); + const __m256 amaxv0_1 = _mm256_max_ps(amaxv[0], amaxv0_0); + const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e); + const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2); + const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1); + const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4); + + amax = _mm256_cvtss_f32(amaxv0_5); + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
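+            // (note) the permute/max ladder above performs a horizontal max: swap the
+            // 128-bit halves, then 64-bit pairs, then adjacent lanes, leaving the maximum
+            // |src| of the whole 32-value block in every lane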
1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < 4; l++) { + __m256 v = _mm256_mul_ps(srcv[l], idv); + + // convert to int8 + __m256i vi = _mm256_cvtps_epi32(v); + vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8)); + + int32_t vi_0 = _mm256_extract_epi32(vi, 0); + int32_t vi_1 = _mm256_extract_epi32(vi, 1); + int32_t vi_2 = _mm256_extract_epi32(vi, 2); + int32_t vi_3 = _mm256_extract_epi32(vi, 3); + + int32_t vi_4 = _mm256_extract_epi32(vi, 4); + int32_t vi_5 = _mm256_extract_epi32(vi, 5); + int32_t vi_6 = _mm256_extract_epi32(vi, 6); + int32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + assert(vi_0 >= 0 && vi_0 < 16); + assert(vi_1 >= 0 && vi_1 < 16); + assert(vi_2 >= 0 && vi_2 < 16); + assert(vi_3 >= 0 && vi_3 < 16); + + assert(vi_4 >= 0 && vi_4 < 16); + assert(vi_5 >= 0 && vi_5 < 16); + assert(vi_6 >= 0 && vi_6 < 16); + assert(vi_7 >= 0 && vi_7 < 16); + } + + memcpy(pb + i*16, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) + { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(src + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); + for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); + for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + + amax = MAX( + MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)), + MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < 8; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); + const int32x4_t vi = vcvtq_s32_f32(vf); + + pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); + pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); + } + + memcpy(pb + i*16, pp, sizeof(pp)); + } +#else + { + for (int l = 0; l < 32; l++) { + const float v = src[i*32 + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
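+        // (note) with QB == 4 the divisor is (1 << 3) - 1 = 7, so v*id falls in [-7, 7];
+        // after round() and the +8 bias every stored nibble lies in [1, 15], with 8
+        // encoding zero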
1.0/d : 0.0;
+
+        pd[i] = GGML_FP32_TO_GQ(d);
+
+        for (int l = 0; l < 32; l++) {
+            const float v = src[i*32 + l]*id;
+            const int8_t vi = ((int8_t) (round(v))) + 8;
+            assert(vi >= 0 && vi < 16);
+            pp[l/2] |= (vi & 0xf) << (4*(l & 1));
+        }
+
+        memcpy(pb + i*16, pp, sizeof(pp));
+    }
+#endif
+        //printf("amax = %f\n", amax);
+    }
+}
+
+// reimplementation of quantize_6 using quantize_6_row
+void quantize_6(const float * restrict src, char * restrict dst, int n, int k) {
+    assert(k % 32 == 0);
+
+    for (int j = 0; j < n; j++) {
+        quantize_6_row(src + j*k, dst, k);
+        dst = (char *) dst + quantize_6_row_size(k);
+    }
+}
+
+void vec_dot_gq_6(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
+    const int nb = quantize_6_blocks_per_row(n);
+
+    const gq_scale_t * restrict pd0 = (const gq_scale_t *) x;
+    const gq_scale_t * restrict pd1 = (const gq_scale_t *) y;
+
+    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
+    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
+
+    float sumf = 0.0;
+
+#if 0
+    // scalar
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
+        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
+
+        const uint8_t * restrict p0 = pb0 + i*16;
+        const uint8_t * restrict p1 = pb1 + i*16;
+
+        for (int j = 0; j < 16; j++) {
+            const uint8_t v0 = p0[j];
+            const uint8_t v1 = p1[j];
+
+            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
+            const float f1 = d0*((int8_t) (v0 >> 4) - 8);
+
+            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
+            const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+
+            sumf += f0*f2 + f1*f3;
+        }
+    }
+#else
+#if defined(__AVX2__)
+    // TODO
+#elif defined (__ARM_NEON)
+#if 0
+    float sum0 = 0.0f;
+
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
+        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
+
+        //float32x4_t d0d1v = vdupq_n_f32(d0*d1);
+        //float16x8_t d0d1v = vdupq_n_f16(d0*d1);
+
+        const uint8_t * restrict p0 = pb0 + i*16;
+        const uint8_t * restrict p1 = pb1 + i*16;
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+        const int8x16_t s8b = vdupq_n_s8(0x8);
+
+        const uint8x16_t v0_0 = vld1q_u8(p0);
+        const uint8x16_t v1_0 = vld1q_u8(p1);
+
+        // 4-bit -> 8-bit
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
+
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
+
+        // dot product into int16x8_t
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
+
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
+
+        const int16x8_t pl = vaddq_s16(pl0l, pl0h);
+        const int16x8_t ph = vaddq_s16(ph0l, ph0h);
+
+        const int16x8_t p = vaddq_s16(pl, ph);
+
+        // scalar
+        sum0 += d0*d1*vaddvq_s16(p);
+    }
+
+    sumf = sum0;
+#elif 1 // this is a bit faster than the above
+    float sum0 = 0.0f;
+    float sum1 = 0.0f;
+
+    for (int i = 0; i < nb; i += 2) {
+        const float d0_0 = GGML_GQ_TO_FP32(pd0[i + 0]);
+        const float d1_0 = GGML_GQ_TO_FP32(pd1[i + 0]);
+        const float d0_1 = GGML_GQ_TO_FP32(pd0[i + 1]);
+        const float d1_1 = GGML_GQ_TO_FP32(pd1[i + 1]);
+
+        const uint8_t * restrict p0 = pb0 + i*16;
+        const uint8_t * restrict p1 = pb1
+ i*16; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); + const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); + + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); + + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); + const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); + + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); + + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); + + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); + + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); + + // dot product into int16x8_t + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); + + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + + const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); + const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); + + const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); + const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); + + const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); + const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); + + // scalar + sum0 += d0_0*d1_0*vaddvq_s16(p_0); + sum1 += d0_1*d1_1*vaddvq_s16(p_1); + } + + sumf = sum0 + sum1; +#endif +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_6 to compute the dot product of two rows +void mul_mat_gq_6( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % 32 == 0); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_6(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_6_row_size(k); + } + src0 = (const char *) src0 + quantize_6_row_size(k); + src1 = (const char *) src1 - n*quantize_6_row_size(k); + + dst = (float *) dst + n; + } +} + +int main(int argc, const char ** argv) { + assert(sizeof(gq_quant_t)*8 == gq_t_bits); + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst = malloc(sizeof(float)*M*N); + + // allocate aligned memory + //float * src0 = (float *)aligned_alloc(32, sizeof(float)*M*K); + //float * src1 = (float *)aligned_alloc(32, sizeof(float)*N*K); + //float * 
dst = (float *)aligned_alloc(32, sizeof(float)*M*N); + + for (int i = 0; i < M*K; i++) { + src0[i] = 0.8 - rand() / (float)RAND_MAX; + /*src0[i] = rand() / (float)RAND_MAX;*/ + /*src0[i] = i % 2;*/ + } + + for (int i = 0; i < N*K; i++) { + src1[i] = 0.8 - rand() / (float)RAND_MAX; + /*src1[i] = rand() / (float)RAND_MAX;*/ + /*src1[i] = i % 3;*/ + } + + void * src0_gq = NULL; + void * src1_gq = NULL; + + size_t sizegq = 0; + + { + if (method == 1) { + src0_gq = calloc(1, quantize_1_row_size(K)*M); + src1_gq = calloc(1, quantize_1_row_size(K)*N); + + sizegq = quantize_1_row_size(K)*M + quantize_1_row_size(K)*N; + } + + if (method == 2) { + src0_gq = calloc(1, quantize_2_row_size(K)*M); + src1_gq = calloc(1, quantize_2_row_size(K)*N); + + sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N; + } + + if (method == 3) { + src0_gq = calloc(1, quantize_3_row_size(K)*M); + src1_gq = calloc(1, quantize_3_row_size(K)*N); + + sizegq = quantize_3_row_size(K)*M + quantize_3_row_size(K)*N; + } + + if (method == 4) { + src0_gq = calloc(1, quantize_4_row_size(K)*M); + src1_gq = calloc(1, quantize_4_row_size(K)*N); + + sizegq = quantize_4_row_size(K)*M + quantize_4_row_size(K)*N; + } + + if (method == 5) { + src0_gq = calloc(1, quantize_5_row_size(K)*M); + src1_gq = calloc(1, quantize_5_row_size(K)*N); + + sizegq = quantize_5_row_size(K)*M + quantize_5_row_size(K)*N; + } + + if (method == 6) { + src0_gq = calloc(1, quantize_6_row_size(K)*M); + src1_gq = calloc(1, quantize_6_row_size(K)*N); + + sizegq = quantize_6_row_size(K)*M + quantize_6_row_size(K)*N; + } + } + + const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K; + + printf("compression: %f\n", (float)sizegq/sizef16); + + // convert fp32 -> gq + { + const int64_t t_start = ggml_time_us(); + + if (method == 1) { + quantize_1(src0, src0_gq, M, K); + quantize_1(src1, src1_gq, N, K); + } + + if (method == 2) { + quantize_2(src0, src0_gq, M, K); + quantize_2(src1, src1_gq, N, K); + } + + if (method == 3) { + quantize_3(src0, src0_gq, M, K); + quantize_3(src1, src1_gq, N, K); + } + + if (method == 4) { + quantize_4(src0, src0_gq, M, K); + quantize_4(src1, src1_gq, N, K); + } + + if (method == 5) { + quantize_5(src0, src0_gq, M, K); + quantize_5(src1, src1_gq, N, K); + } + + if (method == 6) { + quantize_6(src0, src0_gq, M, K); + quantize_6(src1, src1_gq, N, K); + } + + const int64_t t_end = ggml_time_us(); + printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], src1[i]); + } + + const int nIter = 1; + + const int64_t start = ggml_cycles(); + const int64_t start_us = ggml_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + if (method == 0) { + mul_mat_f32_naive(src0, src1, dst, M, N, K); + } + + if (method == 1) { + mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 2) { + mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 3) { + mul_mat_gq_3(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 4) { + mul_mat_gq_4(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 5) { + mul_mat_gq_5(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 6) { + mul_mat_gq_6(src0_gq, src1_gq, dst, M, N, K); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const int64_t end = ggml_cycles(); + const int64_t end_us = ggml_time_us(); + printf("%s: elapsed ticks: %" PRIu64 "\n", __func__, end - start); + printf("%s: elapsed us: %d / %f ms\n", __func__, 
(int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter); + } + +#if 0 + // print src0 + printf("src0:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src0[i*K+j]); + } + printf("\n"); + } + + // print src1 + printf("src1:\n"); + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src1[i*K+j]); + } + printf("\n"); + } + + printf("dst:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + printf("%4.1f ", dst[i*N+j]); + } + printf("\n"); + } +#endif + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + if (src0_gq) free(src0_gq); + if (src1_gq) free(src1_gq); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-opt.cpp b/stable-diffusion.cpp/ggml/tests/test-opt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb8af59620b146fc9c2e93d048cdb6a162c6937f --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-opt.cpp @@ -0,0 +1,180 @@ +#include "ggml.h" + +#include +#include +#include +#include + +#define MAX_NARGS 2 + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +// +// logging +// +#define GGML_DEBUG 0 +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + + +static float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +static struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax +) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + } + + return result; +} + +int main(void) { + struct ggml_init_params params = { + /* .mem_size = */ 1024*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + struct ggml_context * ctx = ggml_init(params); + + int64_t ne1[4] = {4, 128, 1, 1}; + int64_t ne2[4] = {4, 256, 1, 1}; + int64_t ne3[4] = {128, 256, 1, 1}; + + struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); + struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1); + ggml_set_param(ctx, a); + ggml_set_param(ctx, b); + + struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1); + + struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b); + struct ggml_tensor * d = ggml_sub(ctx, c, ab); + struct ggml_tensor * e = ggml_sum(ctx, 
ggml_sqr(ctx, d)); + + struct ggml_cgraph ge = ggml_build_forward(e); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + + const float fe = ggml_get_f32_1d(e, 0); + printf("%s: e = %.4f\n", __func__, fe); + + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + + ggml_opt(ctx, opt_params, e); + + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + + const float fe_opt = ggml_get_f32_1d(e, 0); + printf("%s: original e = %.4f\n", __func__, fe); + printf("%s: optimized e = %.4f\n", __func__, fe_opt); + + const bool success = (fe_opt <= fe); + assert(success); + + ggml_free(ctx); + return success ? 0 : -1; +} +// int64_t ne1[4] = {4, 128, 1, 1}; +// int64_t ne2[4] = {4, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 25890.9375 +// main: optimized e = 10094.7031 + +// int64_t ne1[4] = {8, 128, 1, 1}; +// int64_t ne2[4] = {8, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 39429.5078 +// main: optimized e = 9275.8936 + +// int64_t ne1[4] = {16, 128, 1, 1}; +// int64_t ne2[4] = {16, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 68371.1328 +// main: optimized e = 7854.4502 + + +// int64_t ne1[4] = {32, 128, 1, 1}; +// int64_t ne2[4] = {32, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 126061.1953 +// main: optimized e = 5451.0166 + +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1620817.8750 +// main: optimized e = 698387.6875 + +// another run on M1 +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1629595.6250 +// main: optimized e = 698169.1250 + +// int64_t ne1[4] = {32, 1024, 1, 1}; +// int64_t ne2[4] = {32, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 8146770.5000 +// main: optimized e = 651119.1250 diff --git a/stable-diffusion.cpp/ggml/tests/test-pool.c b/stable-diffusion.cpp/ggml/tests/test-pool.c new file mode 100644 index 0000000000000000000000000000000000000000..cdf00f4ec989f129d38ddbdb41ebe6b040da260a --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-pool.c @@ -0,0 +1,143 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +int main(int argc, const char** argv) { + + float buf_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f32[i] = (float)(i + 1); + } + + // avg pool 1d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_AVG, 3, 3, 0); + GGML_ASSERT(t_pooled->ne[0] == 3); + GGML_ASSERT(t_pooled->ne[1] == 2); + GGML_ASSERT(t_pooled->ne[2] == 1); + + struct ggml_cgraph graph = ggml_build_forward(t_pooled); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(t_pooled); + + GGML_ASSERT(output[0] == 2); + GGML_ASSERT(output[1] == 5); + GGML_ASSERT(output[2] == 8); + GGML_ASSERT(output[3] == 12); + GGML_ASSERT(output[4] == 15); + GGML_ASSERT(output[5] == 18); + + ggml_free(ctx); + } + + // max pool 1d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = 
ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+        memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+        struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_MAX, 3, 3, 0);
+        GGML_ASSERT(t_pooled->ne[0] == 3);
+        GGML_ASSERT(t_pooled->ne[1] == 2);
+        GGML_ASSERT(t_pooled->ne[2] == 1);
+
+        struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(t_pooled);
+        // each row of 10 values is pooled on its own: the second row starts at 11,
+        // so its window maxima are 13, 16, 19
+        GGML_ASSERT(output[0] == 3);
+        GGML_ASSERT(output[1] == 6);
+        GGML_ASSERT(output[2] == 9);
+        GGML_ASSERT(output[3] == 13);
+        GGML_ASSERT(output[4] == 16);
+        GGML_ASSERT(output[5] == 19);
+
+        ggml_free(ctx);
+    }
+
+    // avg pool 2d
+    {
+        struct ggml_context * ctx = make_ctx();
+        struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
+        memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+        struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_AVG, 3, 4, 3, 4, 0, 0);
+        GGML_ASSERT(t_pooled->ne[0] == 3);
+        GGML_ASSERT(t_pooled->ne[1] == 2);
+        GGML_ASSERT(t_pooled->ne[2] == 2);
+        GGML_ASSERT(t_pooled->ne[3] == 1);
+
+        struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(t_pooled);
+        GGML_ASSERT(output[0] == 17);
+        GGML_ASSERT(output[1] == 20);
+        GGML_ASSERT(output[2] == 23);
+        GGML_ASSERT(output[3] == 57);
+        GGML_ASSERT(output[4] == 60);
+        GGML_ASSERT(output[5] == 63);
+        GGML_ASSERT(output[6] == 117);
+        GGML_ASSERT(output[7] == 120);
+        GGML_ASSERT(output[8] == 123);
+        GGML_ASSERT(output[9] == 157);
+        GGML_ASSERT(output[10] == 160);
+        GGML_ASSERT(output[11] == 163);
+
+        ggml_free(ctx);
+    }
+
+    // max pool 2d
+    {
+        struct ggml_context * ctx = make_ctx();
+        struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
+        memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+        struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_MAX, 3, 4, 3, 4, 0, 0);
+        GGML_ASSERT(t_pooled->ne[0] == 3);
+        GGML_ASSERT(t_pooled->ne[1] == 2);
+        GGML_ASSERT(t_pooled->ne[2] == 2);
+        GGML_ASSERT(t_pooled->ne[3] == 1);
+
+        struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+        ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+        const float * output = ggml_get_data_f32(t_pooled);
+        GGML_ASSERT(output[0] == 33);
+        GGML_ASSERT(output[1] == 36);
+        GGML_ASSERT(output[2] == 39);
+        GGML_ASSERT(output[3] == 73);
+        GGML_ASSERT(output[4] == 76);
+        GGML_ASSERT(output[5] == 79);
+        GGML_ASSERT(output[6] == 133);
+        GGML_ASSERT(output[7] == 136);
+        GGML_ASSERT(output[8] == 139);
+        GGML_ASSERT(output[9] == 173);
+        GGML_ASSERT(output[10] == 176);
+        GGML_ASSERT(output[11] == 179);
+
+        ggml_free(ctx);
+    }
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-quantize-fns.cpp b/stable-diffusion.cpp/ggml/tests/test-quantize-fns.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..884af40548fb7912cd2e80c3c7e503bba938c06b
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-quantize-fns.cpp
@@ -0,0 +1,166 @@
+// Unit tests for quantization specific functions - quantize, dequantize and dot product
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
+// the quantization limits above bound the error measure computed by array_rmse below;
+// lower bit-widths are allowed proportionally more error (2-bit gets ~4x the default)
+
+static const char* RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Generate synthetic data
+static void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset);
+    }
+}
+
+// Error between two float arrays: sqrt(total squared error) / n
+// (not a true RMSE, which would divide by n inside the sqrt)
+static float array_rmse(const float * a1, const float * a2, size_t n) {
+    double sum = 0;
+    for (size_t i = 0; i < n; i++) {
+        double diff = a1[i] - a2[i];
+        sum += diff * diff;
+    }
+    return sqrtf(sum) / n;
+}
+
+// Total quantization error on test data
+static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    return array_rmse(test_data, tmp_out.data(), test_size);
+}
+
+// Quantization error relative to the reference implementation
+static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+    std::vector<float> tmp_out_ref(test_size);
+
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+
+    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+
+    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
+}
+
+static float dot_product(const float * a1, const float * a2, size_t test_size) {
+    double sum = 0;
+    for (size_t i = 0; i < test_size; i++) {
+        sum += a1[i] * a2[i];
+    }
+    return sum;
+}
+
+// Total dot product error
+static float dot_product_error(
+    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+) {
+    std::vector<uint8_t> tmp_q1(2*test_size);
+    std::vector<uint8_t> tmp_q2(2*test_size);
+
+    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+
+    qfns.from_float(test_data1, tmp_q1.data(), test_size);
+    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+
+    float result = INFINITY;
+    qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
+
+    const float dot_ref = dot_product(test_data1, test_data2, test_size);
+
+    return fabsf(result - dot_ref) / test_size;
+}
+
+int main(int argc, char * argv[]) {
+    bool verbose = false;
+    const size_t test_size = 32 * 128;
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data());
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    int num_failed = 0;
+    bool failed = false;
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+
+        if (qfns.from_float && qfns.to_float) {
+            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+            const float max_quantization_error =
+                type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+                type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
+            failed = !(total_error < max_quantization_error);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
+            }
+
+            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
+            }
+
+            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
+            }
+        }
+    }
+
+    if (num_failed || verbose) {
+        printf("%d tests failed\n", num_failed);
+    }
+
+    ggml_free(ctx);
+
+    return num_failed > 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-quantize-perf.cpp b/stable-diffusion.cpp/ggml/tests/test-quantize-perf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88fac0e23106bc4f11c4899da9c12479a29c352a
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-quantize-perf.cpp
@@ -0,0 +1,361 @@
+// Benchmark quantization specific functions on synthetic data
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <inttypes.h>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define MAX_ALIGNMENT 64
+#define QK 32
+#define WARMUP 5
+#define ITERATIONS 10
+#define MAX_ITERATIONS 100000000
+
+#define L1_SIZE      32*128
+#define L2_SIZE     32*2048
+#define L3_SIZE    32*20480
+#define MEM_SIZE 32*2048000
+
+struct quantize_perf_params {
+    std::vector<std::string> include_types;
+    std::vector<size_t> test_sizes;
+    size_t alignment_offset = 0;
+    bool op_quantize_row_q_reference = false;
+    bool op_quantize_row_q = false;
+    bool op_dequantize_row_q = false;
+    bool op_quantize_row_q_dot = false;
+    bool op_vec_dot_q = false;
+    int64_t iterations = ITERATIONS;
+};
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#include <x86intrin.h>
+inline int64_t cpu_cycles() {
+// Rough way to detect new-ish CPUs
+#ifdef __POPCNT__
+    unsigned int dummy;
+    return __rdtscp(&dummy);
+#else
+    return __rdtsc();
+#endif
+}
+
+#else
+
+#define cpu_cycles() 0
+
+#endif
+
+
+// Generate synthetic data
+static void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset);
+    }
+}
+
+static float gigabytes_per_second(size_t bytes, int64_t usecs) {
+    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
+}
+
+static void * align_with_offset(void * ptr, int offset) {
+    size_t dummy_size = MAX_ALIGNMENT * 4;
+    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
+}
+
+static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
+    int64_t min_time_us = INT64_MAX;
+    int64_t total_time_us = 0;
+    int64_t min_time_cycles = INT64_MAX;
+    int64_t total_time_cycles = 0;
+
+    for (int i = 0; i < WARMUP; i++) {
+        func();
+    }
+
+    for (int i = 0; i < iterations; i++) {
+        const int64_t start_time = ggml_time_us();
+        const int64_t start_cycles = cpu_cycles();
+
+        func();
+
+        const int64_t end_cycles = cpu_cycles();
+        const int64_t end_time = ggml_time_us();
+
+        total_time_cycles += end_cycles - start_cycles;
+        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
+        total_time_us += end_time - start_time;
+        min_time_us = std::min(min_time_us, end_time - start_time);
+    }
+
+    printf("      min cycles/%d vals   : %9.2f\n", QK, QK * min_time_cycles / (float) size);
+    printf("      avg cycles/%d vals   : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
+    printf("      float32 throughput   : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us));
+    printf("      quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
+}
+
+static void usage(char * argv[]) {
+    printf("Benchmark quantization specific functions on synthetic data\n");
+    printf("\n");
+    printf("usage: %s [options]\n", argv[0]);
+    printf("\n");
+    printf("options: (default)\n");
+    printf("  -h, --help            show this help message and exit\n");
+    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
+    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
+    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
+    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
+    printf("  --type TYPE           set test type as");
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (ggml_type_name(type) != NULL) {
+            if (qfns.from_float && qfns.to_float) {
+                printf(" %s", ggml_type_name(type));
+            }
+        }
+    }
+    printf(" (all)\n");
+    printf("  --alignment-offset OFFSET\n");
+    printf("                        set alignment offset as OFFSET (0)\n");
+    printf("  -i NUM, --iterations NUM\n");
+    printf("                        set test iteration number (%d)\n", ITERATIONS);
+}
+
+int main(int argc, char * argv[]) {
+    quantize_perf_params params {};
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "--size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            size_t size = std::stoi(argv[i]);
+            if (size % 32 != 0) {
+                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
+                invalid_param = true;
+                break;
+            }
+            params.test_sizes.push_back(size);
+        } else if (arg == "-3") {
+            // quick select sizes that probably fit in CPU caches
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+        } else if (arg == "-4") {
+            // quick select cache sizes + memory
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+            params.test_sizes.push_back(MEM_SIZE);
+        } else if (arg == "--op") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string op {argv[i]};
+            if (op == "quantize_row_q_reference") {
+                params.op_quantize_row_q_reference = true;
+            } else if (op == "quantize_row_q") {
+                params.op_quantize_row_q = true;
+            } else if (op == "dequantize_row_q") {
+                params.op_dequantize_row_q = true;
+            } else if (op == "quantize_row_q_dot") {
+                params.op_quantize_row_q_dot = true;
+            } else if (op == "vec_dot_q") {
+                params.op_vec_dot_q = true;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_types.push_back(argv[i]);
+        } else if (arg == "--alignment-offset") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int alignment = std::stoi(argv[i]);
+            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
+                fprintf(stderr, "error: alignment-offset must be between 0 and %d\n", MAX_ALIGNMENT);
+                invalid_param = true;
+                break;
+            }
+            params.alignment_offset = alignment;
+        } else if ((arg == "-i") || (arg == "--iterations")) {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int number = std::stoi(argv[i]);
+            if (number < 0 || number > MAX_ITERATIONS) {
+                fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
+                invalid_param = true;
+                break;
+            }
+            params.iterations = number;
+        } else if ((arg == "-h") || (arg == "--help")) {
+            usage(argv);
+            return 1;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return 1;
+    }
+
+    if (params.test_sizes.empty()) {
+        params.test_sizes.push_back(L1_SIZE);
+    }
+    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
+        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
+    }
+
+    std::sort(params.test_sizes.begin(), params.test_sizes.end());
+    size_t largest = params.test_sizes.back();
+
+    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
+
+    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
+    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
+    float * test_q1    = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
+    float * test_q2    = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
+    float * test_out   = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
+
+    generate_data(0, largest, test_data1);
+    generate_data(1, largest, test_data2);
+
+    int64_t iterations = params.iterations;
+
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+            continue;
+        }
+
+        if (qfns.from_float && qfns.to_float) {
+            printf("%s\n", ggml_type_name(type));
+
+            if (params.op_quantize_row_q_reference) {
+                printf("  quantize_row_q_reference\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        qfns.from_float_reference(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size,
quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q) { + printf(" quantize_row_q\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + qfns.from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_dequantize_row_q) { + printf(" dequantize_row_q\n"); + qfns.from_float(test_data1, test_q1, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + qfns.to_float(test_q1, test_out, size); + return test_out[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q_dot) { + printf(" quantize_row_q_dot\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + vdot.from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_vec_dot_q) { + printf(" vec_dot_q\n"); + qfns.from_float(test_data1, test_q1, largest); + qfns.from_float(test_data2, test_q2, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + float result; + qfns.vec_dot(size, &result, test_q1, test_q2); + return result; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + } + } + + ggml_free(ctx); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-rel-pos.c b/stable-diffusion.cpp/ggml/tests/test-rel-pos.c new file mode 100644 index 0000000000000000000000000000000000000000..19960b453ea950c19141b5df217fb49384a64558 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-rel-pos.c @@ -0,0 +1,84 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == ne0); + GGML_ASSERT(t->ne[1] == ne1); + GGML_ASSERT(t->ne[2] == ne2); + for (int i2 = 0; i2 < ne2; ++i2) { + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0); + float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0]; + GGML_ASSERT(expected == actual); + } + } + } +} + +int main(int argc, const char** argv) { + ggml_fp16_t buf_f16[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f16[i] = ggml_fp32_to_fp16((float)i); + } + + float expected_out[4][9] = { + { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 }, + { 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 4.0, 5.0, 6.0 }, + { 14.0, 15.0, 16.0, 
15.0, 16.0, 17.0, 16.0, 17.0, 18.0 }, + { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 }, + }; + + { + struct ggml_context * ctx = make_ctx(); + + + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); + ggml_fp16_t* t_d = (ggml_fp16_t*)t->data; + memcpy(t_d, buf_f16, ggml_nbytes(t)); + + struct ggml_tensor * t_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); + ggml_fp16_t* t_d_2 = (ggml_fp16_t*)t_2->data; + memcpy(t_d_2, buf_f16 + 1, ggml_nbytes(t_2)); + + struct ggml_tensor * rw = ggml_get_rel_pos(ctx, t, 2, 2); + struct ggml_tensor * rh = ggml_get_rel_pos(ctx, t_2, 2, 2); + + struct ggml_tensor * rw_f32 = ggml_cpy(ctx, rw, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2)); + struct ggml_tensor * rh_f32 = ggml_cpy(ctx, rh, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2)); + + struct ggml_tensor * in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4); + struct ggml_tensor * out_inplace = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4); + float * in_d = (float*)in->data; + float * out_inplace_d = (float*)out_inplace->data; + for (int i = 0; i < ggml_nelements(in); ++i) { + in_d[i] = 1.f; + out_inplace_d[i] = 1.f; + } + + struct ggml_tensor * out = ggml_add_rel_pos(ctx, in, rw_f32, rh_f32); + struct ggml_cgraph gf = ggml_build_forward(out); + ggml_graph_compute_with_ctx(ctx, &gf, 1); + + out_inplace = ggml_add_rel_pos_inplace(ctx, out_inplace, rw_f32, rh_f32); + struct ggml_cgraph gf_2 = ggml_build_forward(out_inplace); + ggml_graph_compute_with_ctx(ctx, &gf_2, 1); + + check_tensor(out, (float*)expected_out, 9, 4, 1); + check_tensor(out_inplace, (float*)expected_out, 9, 4, 1); + } + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-svd0.c b/stable-diffusion.cpp/ggml/tests/test-svd0.c new file mode 100644 index 0000000000000000000000000000000000000000..8160bd3b9c7780197ba2c37177d0ae738d84a264 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-svd0.c @@ -0,0 +1,218 @@ +// SVD dimensionality reduction + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef GGML_USE_ACCELERATE +#include +#endif + +float frand(void) { + return (float) rand() / (float) RAND_MAX; +} + +//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m, +// __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda, +// __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu, +// __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work, +// __CLPK_integer *__lwork, +// __CLPK_integer *__info) + +int main(int argc, const char ** argv) { + int m = 10; + int n = 5; + + float * A = malloc(n * m * sizeof(float)); + float * A0 = malloc(n * m * sizeof(float)); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < m; ++j) { + A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand()); + //A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand()); + //if (i == 2) { + // A[i * m + j] += 20*frand(); + //} + if ((i == 1 || i == 3) && j > m/2) { + A[i * m + j] = -A[i * m + j]; + } + } + } + + // average vector + //float * M = malloc(m * sizeof(float)); + + //{ + // for (int j = 0; j < m; ++j) { + // M[j] = 0.0f; + // } + // for (int i = 0; i < n; ++i) { + // for (int j = 0; j < m; ++j) { + // M[j] += A[i * m + j]; + // } + // } + // for (int j = 0; j < m; ++j) { + // M[j] /= (float) n; + // } + //} + + //// subtract average vector + //for (int i = 0; i < n; ++i) { + // for (int j = 0; j < m; ++j) { + // A[i * m + j] -= M[j]; + // } + //} + + memcpy(A0, A, n * m * sizeof(float)); + + // print A + printf("A:\n"); + for (int i = 0; i < n; 
++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", A[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // SVD + // A = U * S * V^T + + float * U = malloc(n * m * sizeof(float)); + float * S = malloc(n * sizeof(float)); + float * V = malloc(n * n * sizeof(float)); + + int lda = m; + int ldu = m; + int ldvt = n; + + float work_size; + int lwork = -1; + int info = 0; + + sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info); + + lwork = (int) work_size; + + printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork); + + float * work = malloc(lwork * sizeof(float)); + + sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info); + + // print U + printf("U:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", U[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // normalize S + { + double sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += S[i]; + } + sum *= sqrt((double) m); + for (int i = 0; i < n; ++i) { + S[i] /= sum; + } + } + + // print S + printf("S:\n"); + for (int i = 0; i < n; ++i) { + printf("- %d = %9.5f\n", i, S[i]); + } + printf("\n"); + + // print V + printf("V:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < n; ++j) { + printf("%9.5f ", V[i * n + j]); + } + printf("\n"); + } + printf("\n"); + + // print A + printf("A:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", A[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // compute singular vectors in U + for (int i = 0; i < n; ++i) { + for (int j = 0; j < m; ++j) { + U[i * m + j] *= S[i]; + } + } + + // normalize U + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += U[i * m + j] * U[i * m + j]; + } + sum = sqrt(sum); + for (int j = 0; j < m; ++j) { + U[i * m + j] /= sum*sqrt((double) m); + } + } + + // print U + printf("U:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", U[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + + // project A0 onto U + float * A1 = malloc(n * n * sizeof(float)); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A1[i * n + j] = 0.0f; + for (int k = 0; k < m; ++k) { + A1[i * n + j] += A0[i * m + k] * U[j * m + k]; + } + } + } + + // print A1 + printf("A1:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < n; ++j) { + printf("%9.5f ", A1[i * n + j]); + } + printf("\n"); + } + printf("\n"); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-vec0.c b/stable-diffusion.cpp/ggml/tests/test-vec0.c new file mode 100644 index 0000000000000000000000000000000000000000..5e23f8ebab62404beccdb4c9e0ada861e56a308a --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-vec0.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include + +const int N = 1 << 14; +const int M = 1 << 14; + +void mul_mat_vec_f32_0( + const float * src0, + const float * src1, + float * dst, + unsigned nrows, + unsigned ncols) { + for (unsigned i = 0; i < nrows; i++) { + float sum = 0.0f; + for (unsigned j = 0; j < ncols; j++) { + sum += src0[i*ncols + j]*src1[j]; + } + dst[i] = sum; + } +} +#if defined(_MSC_VER) +typedef float __declspec(align(32)) afloat; +#else +typedef float afloat __attribute__((__aligned__(32))); +#endif +void mul_mat_vec_f32_1( + const afloat *restrict src0, + const 
afloat *restrict src1, + afloat *restrict dst, + unsigned nrows, + unsigned ncols) { + for (unsigned i = 0; i < nrows; i++) { + const afloat * restrict row = src0 + i*ncols; + const afloat * restrict col = src1; + + float sum = 0.0f; + + for (unsigned j = 0; j < ncols; j++) { + sum += *row++ * *col++; + } + + dst[i] = sum; + + //float sum[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + //for (unsigned j = 0; j < ncols; j += 8) { + // sum[0] += row[0]*col[0]; + // sum[1] += row[1]*col[1]; + // sum[2] += row[2]*col[2]; + // sum[3] += row[3]*col[3]; + // sum[4] += row[4]*col[4]; + // sum[5] += row[5]*col[5]; + // sum[6] += row[6]*col[6]; + // sum[7] += row[7]*col[7]; + + // row += 8; + // col += 8; + //} + + //dst[i] = sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7]; + } +} + +void mul_mat_vec_f32_2( + const void * src0, + const void * src1, + void * dst, + unsigned nrows, + unsigned ncols) { + void * d = dst; + for (unsigned i = 0; i < nrows; i++) { + float sum = 0.0f; + + const char * row = (const char*)src0 + i*ncols*sizeof(float); + const char * col = (const char*)src1; + for (unsigned j = 0; j < ncols; j++) { + sum += (*(float *)row) * (*(float *)col); + row += sizeof(float); + col += sizeof(float); + } + *(float *)d = sum; + d = (char*)d + sizeof(float); + } +} + +#if defined(_MSC_VER) +void* aligned_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, alignment); +} +#endif + +int main(int argc, const char ** argv) { + //float * src0 = malloc(sizeof(float)*N*M); + //float * src1 = malloc(sizeof(float)*M); + //float * dst = malloc(sizeof(float)*N); + + afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M)); + afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M)); + afloat * dst = (float *)(aligned_alloc(32, sizeof(float)*N)); + + for (int i = 0; i < N*M; i++) { + src0[i] = (afloat)i; + } + + for (int i = 0; i < M; i++) { + src1[i] = (afloat)i; + } + + const int nIter = 10; + + const clock_t start = clock(); + + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + //mul_mat_vec_f32_0(src0, src1, dst, N, M); + mul_mat_vec_f32_1(src0, src1, dst, N, M); + //mul_mat_vec_f32_2(src0, src1, dst, N, M); + for (int i = 0; i < N; i++) { + sum += dst[i]; + } + } + + { + const clock_t end = clock(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + } + + printf("%f\n", sum); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test-vec1.c b/stable-diffusion.cpp/ggml/tests/test-vec1.c new file mode 100644 index 0000000000000000000000000000000000000000..567cb061740d5c4c922df90015f2c2852b5fbd06 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test-vec1.c @@ -0,0 +1,576 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include + +const int N = 1 << 14; +const int M = 768; + +// +// naive implementation +// + +void mul_mat_vec_f32_0( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + for (int i = 0; i < nrows; i++) { + float sum = 0.0f; + for (int j = 0; j < ncols; j++) { + sum += src0[i*ncols + j]*src1[j]; + } + dst[i] = sum; + } +} + +// +// SIMD with 8 32-bit floats +// + +float reduce_vector8_0(__m256 v) { + __m128 v1 = _mm256_extractf128_ps(v, 0); + __m128 v2 = _mm256_extractf128_ps(v, 1); + __m128 v3 = _mm_add_ps(v1, v2); + __m128 v4 = _mm_shuffle_ps(v3, v3, 0x4e); + __m128 v5 = _mm_add_ps(v3, v4); + __m128 v6 = _mm_shuffle_ps(v5, v5, 0x11); + __m128 v7 = _mm_add_ps(v5, v6); + return _mm_cvtss_f32(v7); +} + +// 
vectorized implementation using AVX +void mul_mat_vec_f32_1( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + + const int ncols8 = ncols & ~7; + + for (int i = 0; i < nrows; i++) { + __m256 sum = _mm256_setzero_ps(); + for (int j = 0; j < ncols8; j += 8) { + __m256 a = _mm256_loadu_ps(src0 + i*ncols + j); + __m256 b = _mm256_loadu_ps(src1 + j); + __m256 c = _mm256_mul_ps(a, b); + sum = _mm256_add_ps(sum, c); + } + dst[i] = reduce_vector8_0(sum); + + for (int j = ncols8; j < ncols; j++) { + dst[i] += src0[i*ncols + j]*src1[j]; + } + } +} + +void mul_mat_vec_f32_2( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + + const int ncols32 = ncols & ~31; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + const float * restrict src0_row = src0 + i*ncols; + for (int j = 0; j < ncols32; j += 32) { + __m256 a0 = _mm256_loadu_ps(src0_row + j + 0); + __m256 a1 = _mm256_loadu_ps(src0_row + j + 8); + __m256 a2 = _mm256_loadu_ps(src0_row + j + 16); + __m256 a3 = _mm256_loadu_ps(src0_row + j + 24); + __m256 b0 = _mm256_loadu_ps(src1 + j + 0); + __m256 b1 = _mm256_loadu_ps(src1 + j + 8); + __m256 b2 = _mm256_loadu_ps(src1 + j + 16); + __m256 b3 = _mm256_loadu_ps(src1 + j + 24); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); + sum2 = _mm256_fmadd_ps(a2, b2, sum2); + sum3 = _mm256_fmadd_ps(a3, b3, sum3); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3); +#endif + } + dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3))); + + for (int j = ncols32; j < ncols; j++) { + dst[i] += src0[i*ncols + j]*src1[j]; + } + } +} + +// +// SIMD with 8 16-bit floats +// + +static inline float fp32_from_bits(uint32_t w) { +#if defined(__OPENCL_VERSION__) + return as_float(w); +#elif defined(__CUDA_ARCH__) + return __uint_as_float((unsigned int) w); +#elif defined(__INTEL_COMPILER) + return _castu32_f32(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyFloatFromInt32((__int32) w); +#else + union { + uint32_t as_bits; + float as_value; + } fp32 = { w }; + return fp32.as_value; +#endif +} + +static inline uint32_t fp32_to_bits(float f) { +#if defined(__OPENCL_VERSION__) + return as_uint(f); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __float_as_uint(f); +#elif defined(__INTEL_COMPILER) + return _castf32_u32(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint32_t) _CopyInt32FromFloat(f); +#else + union { + float as_value; + uint32_t as_bits; + } fp32 = { f }; + return fp32.as_bits; +#endif +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. 
+ */
+static inline float fp16_ieee_to_fp32_value(uint16_t h) {
+    /*
+     * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
+     *      +---+-----+------------+-------------------+
+     *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
+     *      +---+-----+------------+-------------------+
+     * Bits  31  26-30    16-25            0-15
+     *
+     * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
+     */
+    const uint32_t w = (uint32_t) h << 16;
+    /*
+     * Extract the sign of the input number into the high bit of the 32-bit word:
+     *
+     *      +---+----------------------------------+
+     *      | S |0000000 00000000 00000000 00000000|
+     *      +---+----------------------------------+
+     * Bits  31                 0-31
+     */
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    /*
+     * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
+     *
+     *      +-----+------------+---------------------+
+     *      |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
+     *      +-----+------------+---------------------+
+     * Bits  27-31    17-26            0-16
+     */
+    const uint32_t two_w = w + w;
+
+    /*
+     * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
+     * of a single-precision floating-point number:
+     *
+     *       S|Exponent |          Mantissa
+     *      +-+---+-----+------------+----------------+
+     *      |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
+     *      +-+---+-----+------------+----------------+
+     * Bits   |  23-31  |           0-22
+     *
+     * Next, there are some adjustments to the exponent:
+     * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
+     *   formats (0x7F - 0xF = 0x70)
+     * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
+     *   Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
+     *   of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
+     *   - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
+     *     by the difference in the exponent bias (see above).
+     *   - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
+     *     exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
+     *     The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
+     *     partially IEEE754-compliant implementations.
+     *
+     * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
+     * operate on denormal inputs, and do not produce denormal results.
+     */
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
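+    /*
+     * Worked example: for h = 0x3C00 (fp16 1.0), w = 0x3C000000 and two_w = 0x78000000,
+     * so (two_w >> 4) + exp_offset = 0x07800000 + 0x70000000 = 0x77800000, which is the
+     * fp32 bit pattern of 2**112. Multiplying by exp_scale = 2**(-112) yields exactly 1.0f.
+     */
+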
+    /*
+     * Convert denormalized half-precision inputs into single-precision results (always normalized).
+     * Zero inputs are also handled here.
+     *
+     * In a denormalized number the biased exponent is zero, and the mantissa has non-zero bits.
+     * First, we shift the mantissa into bits 0-9 of the 32-bit word.
+     *
+     *                  zeros           |  mantissa
+     *      +---------------------------+------------+
+     *      |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
+     *      +---------------------------+------------+
+     * Bits             10-31                0-9
+     *
+     * Now, remember that denormalized half-precision numbers are represented as:
+     *    FP16 = mantissa * 2**(-24).
+     * The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
+     * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
+     * A normalized single-precision floating-point number is represented as:
+     *    FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
+     * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
+     * number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
+     *
+     * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
+     * is zero, the constructed single-precision number has the value of
+     *    FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
+     * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
+     * the input half-precision number.
+     */
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    /*
+     * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
+     *   input exponent. The variable two_w contains input exponent in bits 27-31, therefore if it is smaller than 2**27, the
+     *   input is either a denormal number, or zero.
+     * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
+     */
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+/*
+ * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
+ * IEEE half-precision format, in bit representation.
+ *
+ * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
+ * floating-point operations and bitcasts between integer and floating-point variables.
+ */
+static inline uint16_t fp16_ieee_from_fp32_value(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf  = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf  = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); +} + +void mul_mat_vec_f16_0( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols8 = ncols & ~7; + + for (int i = 0; i < nrows; i++) { + __m256 sum = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols8; j += 8) { + __m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j))); + __m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); +#if defined(__FMA__) + sum = _mm256_fmadd_ps(a, b, sum); +#else + sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum); +#endif + } + dst[i] = reduce_vector8_0(sum); + + for (int j = ncols8; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +void mul_mat_vec_f16_1( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols16 = ncols & ~15; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols16; j += 16) { + __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0))); + __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8))); + __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); + __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8))); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); +#endif + } + dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1); + + for (int j = ncols16; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +void mul_mat_vec_f16_2( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols32 = ncols & ~31; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols32; j += 32) { + __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0))); + __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8))); + __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16))); + __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24))); + __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); + __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8))); + __m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16))); + __m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24))); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); + sum2 = _mm256_fmadd_ps(a2, b2, sum2); + sum3 = _mm256_fmadd_ps(a3, b3, sum3); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3); +#endif + } + dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3); + + for (int j = ncols32; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * 
fp16_ieee_to_fp32_value(src1[j]);
+        }
+    }
+}
+
+void mul_mat_vec_f16_3(
+    const uint16_t * src0,
+    const float * src1,
+    float * dst,
+    int nrows,
+    int ncols) {
+
+    const int ncols32 = ncols & ~31;
+
+    for (int i = 0; i < nrows; i++) {
+        __m256 sum0 = _mm256_setzero_ps();
+        __m256 sum1 = _mm256_setzero_ps();
+        __m256 sum2 = _mm256_setzero_ps();
+        __m256 sum3 = _mm256_setzero_ps();
+
+        const uint16_t * src0_row = src0 + i * ncols;
+        for (int j = 0; j < ncols32; j += 32) {
+            __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0)));
+            __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
+            __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16)));
+            __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24)));
+            __m256 b0 = _mm256_loadu_ps(src1 + j);
+            __m256 b1 = _mm256_loadu_ps(src1 + j + 8);
+            __m256 b2 = _mm256_loadu_ps(src1 + j + 16);
+            __m256 b3 = _mm256_loadu_ps(src1 + j + 24);
+#if defined(__FMA__)
+            sum0 = _mm256_fmadd_ps(a0, b0, sum0);
+            sum1 = _mm256_fmadd_ps(a1, b1, sum1);
+            sum2 = _mm256_fmadd_ps(a2, b2, sum2);
+            sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
+        }
+        dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
+
+        for (int j = ncols32; j < ncols; j++) {
+            // src1 is already f32 here, so only src0 needs converting
+            dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * src1[j];
+        }
+    }
+}
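+
+// The variant above keeps the matrix in f16 (halving its memory traffic) while
+// the vector stays in f32, so only the matrix elements are converted on load.
+// This mixed f16/f32 path is what method 6 in main() below exercises.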
+
+uint64_t get_time_us(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+int main(int argc, const char ** argv) {
+    float * src0 = malloc(sizeof(float)*N*M);
+    float * src1 = malloc(sizeof(float)*M);
+    float * dst  = malloc(sizeof(float)*N);
+
+    //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M));
+    //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M));
+    //float * dst  = (float *)(aligned_alloc(64, sizeof(float)*N));
+
+    for (int i = 0; i < N*M; i++) {
+        src0[i] = rand() / (float)RAND_MAX;
+    }
+
+    for (int i = 0; i < M; i++) {
+        src1[i] = rand() / (float)RAND_MAX;
+    }
+
+    // convert src0 and src1 to __fp16
+    uint16_t * src0_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*N*M));
+    uint16_t * src1_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*M));
+    //uint16_t * src0_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*N*M));
+    //uint16_t * src1_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*M));
+
+    {
+        const uint64_t t_start = get_time_us();
+
+        for (int i = 0; i < N*M; i++) {
+            src0_fp16[i] = fp16_ieee_from_fp32_value(src0[i]);
+            //printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i]));
+            //assert(!isnan(fp16_ieee_to_fp32_value(src0_fp16[i])));
+        }
+
+        for (int i = 0; i < M; i++) {
+            src1_fp16[i] = fp16_ieee_from_fp32_value(src1[i]);
+        }
+
+        const uint64_t t_end = get_time_us();
+        printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
+    }
+
+    for (int i = 0; i < 16; ++i) {
+        printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i]));
+    }
+
+    int method = 0;
+    if (argc > 1) {
+        method = atoi(argv[1]);
+    }
+
+    const int nIter = 1000;
+
+    const clock_t start = clock();
+    const uint64_t start_us = get_time_us();
+
+    double iM = 1.0/M;
+    double sum = 0.0f;
+    for (int i = 0; i < nIter; i++) {
+        if (method == 0) {
+            mul_mat_vec_f32_0(src0, src1, dst, N, M);
+        }
+
+        if (method == 1) {
+            mul_mat_vec_f32_1(src0, src1, dst, N, M);
+        }
+
+        if (method == 2) {
+            mul_mat_vec_f32_2(src0, src1, dst, N, M);
+        }
+
+        if (method == 3) {
+            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M);
+        }
+
+        if (method == 4) {
+            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M);
+        }
+
+        if (method == 5) {
+            mul_mat_vec_f16_2(src0_fp16, src1_fp16, dst, N, M);
+        }
+
+        if (method == 6) {
+            mul_mat_vec_f16_3(src0_fp16, src1, dst, N, M);
+        }
+    }
+
+    for (int i = 0; i < N; i++) {
+        sum += dst[i]*iM;
+    }
+
+    {
+        const clock_t end = clock();
+        const uint64_t end_us = get_time_us();
+        printf("%s: elapsed ticks: %ld\n", __func__, end - start);
+        printf("%s: elapsed us: %llu\n", __func__, (unsigned long long)(end_us - start_us));
+    }
+
+    printf("%f\n", sum);
+
+    free(src0);
+    free(src1);
+    free(dst);
+
+    free(src0_fp16);
+    free(src1_fp16);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-vec2.c b/stable-diffusion.cpp/ggml/tests/test-vec2.c
new file mode 100644
index 0000000000000000000000000000000000000000..4fa364ca5a5d322783c871b58dab8d74b10258fc
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-vec2.c
@@ -0,0 +1,268 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#include <sys/time.h>
+
+#include <arm_neon.h>
+
+const int N = 1 << 12;
+const int M = 1 << 12;
+
+//
+// naive implementation
+//
+
+void mul_mat_vec_f32_0(
+    const float * restrict src0,
+    const float * restrict src1,
+    float * dst,
+    int nrows,
+    int ncols) {
+    for (int i = 0; i < nrows; i++) {
+        float sum = 0.0f;
+        for (int j = 0; j < ncols; j++) {
+            sum += src0[i*ncols + j]*src1[j];
+        }
+        dst[i] = sum;
+    }
+}
+
+void mul_mat_vec_f16_0(
+    const __fp16 * src0,
+    const __fp16 * src1,
+    float * dst,
+    int nrows,
+    int ncols) {
+
+    const int n64 = ncols & ~63;
+
+    for (int r = 0; r < nrows; r++) {
+        float sumf = 0.0;
+
+        float16x8_t sum0 = vdupq_n_f16(0.0f);
+        float16x8_t sum1 = vdupq_n_f16(0.0f);
+        float16x8_t sum2 = vdupq_n_f16(0.0f);
+        float16x8_t sum3 = vdupq_n_f16(0.0f);
+        float16x8_t sum4 = vdupq_n_f16(0.0f);
+        float16x8_t sum5 = vdupq_n_f16(0.0f);
+        float16x8_t sum6 = vdupq_n_f16(0.0f);
+        float16x8_t sum7 = vdupq_n_f16(0.0f);
+
+        float16x8_t x0, x1, x2, x3, x4, x5, x6, x7;
+        float16x8_t y0, y1, y2, y3, y4, y5, y6, y7;
+
+        const __fp16 * restrict p0 = src0 + r*ncols;
+
+        for (int i = 0; i < n64; i += 64) {
+            x0 = vld1q_f16(p0 + i + 0 );
+            x1 = vld1q_f16(p0 + i + 8 );
+            x2 = vld1q_f16(p0 + i + 16);
+            x3 = vld1q_f16(p0 + i + 24);
+            x4 = vld1q_f16(p0 + i + 32);
+            x5 = vld1q_f16(p0 + i + 40);
+            x6 = vld1q_f16(p0 + i + 48);
+            x7 = vld1q_f16(p0 + i + 56);
+
+            y0 = vld1q_f16(src1 + i + 0 );
+            y1 = vld1q_f16(src1 + i + 8 );
+            y2 = vld1q_f16(src1 + i + 16);
+            y3 = vld1q_f16(src1 + i + 24);
+            y4 = vld1q_f16(src1 + i + 32);
+            y5 = vld1q_f16(src1 + i + 40);
+            y6 = vld1q_f16(src1 + i + 48);
+            y7 = vld1q_f16(src1 + i + 56);
+
+            sum0 = vfmaq_f16(sum0, x0, y0);
+            sum1 = vfmaq_f16(sum1, x1, y1);
+            sum2 = vfmaq_f16(sum2, x2, y2);
+            sum3 = vfmaq_f16(sum3, x3, y3);
+            sum4 = vfmaq_f16(sum4, x4, y4);
+            sum5 = vfmaq_f16(sum5, x5, y5);
+            sum6 = vfmaq_f16(sum6, x6, y6);
+            sum7 = vfmaq_f16(sum7, x7, y7);
+        }
+
+        // TODO: F16 - better way to reduce this ?
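+        // One option (a sketch; assumes AArch64, where vaddvq_f32 is available) is to
+        // widen each f16 accumulator to f32 before the horizontal sum, which also
+        // avoids further f16 rounding during the reduction:
+        //   float32x4_t lo = vcvt_f32_f16(vget_low_f16 (sum0));
+        //   float32x4_t hi = vcvt_f32_f16(vget_high_f16(sum0));
+        //   sumf += vaddvq_f32(vaddq_f32(lo, hi));   // and likewise for sum1..sum7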
+        float16x8_t sum = vaddq_f16(sum0, sum1);
+
+        sum = vaddq_f16(sum, sum2);
+        sum = vaddq_f16(sum, sum3);
+        sum = vaddq_f16(sum, sum4);
+        sum = vaddq_f16(sum, sum5);
+        sum = vaddq_f16(sum, sum6);
+        sum = vaddq_f16(sum, sum7);
+
+        sumf += sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7];
+
+        for (int j = n64; j < ncols; j++) {
+            sumf += src0[r*ncols + j]*src1[j];
+        }
+
+        dst[r] = sumf;
+    }
+}
+
+void mul_mat_vec_f16_1(
+    const __fp16 * src0,
+    const __fp16 * src1,
+    float * dst,
+    int nrows,
+    int ncols) {
+
+    const int n32 = ncols & ~31;
+
+    for (int r = 0; r < nrows; r++) {
+        float sumf = 0.0;
+
+        float16x8_t sum0 = vdupq_n_f16(0.0f);
+        float16x8_t sum1 = vdupq_n_f16(0.0f);
+        float16x8_t sum2 = vdupq_n_f16(0.0f);
+        float16x8_t sum3 = vdupq_n_f16(0.0f);
+
+        float16x8_t x0, x1, x2, x3;
+        float16x8_t y0, y1, y2, y3;
+
+        const __fp16 * restrict p0 = src0 + r*ncols;
+
+        for (int i = 0; i < n32; i += 32) {
+            x0 = vld1q_f16(p0 + i + 0 );
+            x1 = vld1q_f16(p0 + i + 8 );
+            x2 = vld1q_f16(p0 + i + 16);
+            x3 = vld1q_f16(p0 + i + 24);
+
+            y0 = vld1q_f16(src1 + i + 0 );
+            y1 = vld1q_f16(src1 + i + 8 );
+            y2 = vld1q_f16(src1 + i + 16);
+            y3 = vld1q_f16(src1 + i + 24);
+
+            sum0 = vfmaq_f16(sum0, x0, y0);
+            sum1 = vfmaq_f16(sum1, x1, y1);
+            sum2 = vfmaq_f16(sum2, x2, y2);
+            sum3 = vfmaq_f16(sum3, x3, y3);
+        }
+
+        // reduce sum0..sum3 to sum0
+        sum0 = vaddq_f16(sum0, sum1);
+        sum2 = vaddq_f16(sum2, sum3);
+        sum0 = vaddq_f16(sum0, sum2);
+
+        // load sum0 into 2 float32x4_t
+        float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
+        float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
+
+        // reduce sum0f32 and sum1f32 to sumf
+        sum0f32 = vaddq_f32(sum0f32, sum1f32);
+
+        float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
+        sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+
+        //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
+
+        for (int j = n32; j < ncols; j++) {
+            sumf += src0[r*ncols + j]*src1[j];
+        }
+
+        dst[r] = sumf;
+    }
+}
+
+uint64_t get_time_us(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+int main(int argc, const char ** argv) {
+    float * src0 = malloc(sizeof(float)*N*M);
+    float * src1 = malloc(sizeof(float)*M);
+    float * dst  = malloc(sizeof(float)*N);
+
+    //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M));
+    //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M));
+    //float * dst  = (float *)(aligned_alloc(64, sizeof(float)*N));
+
+    for (int i = 0; i < N*M; i++) {
+        src0[i] = rand() / (float)RAND_MAX;
+    }
+
+    for (int i = 0; i < M; i++) {
+        src1[i] = rand() / (float)RAND_MAX;
+    }
+
+    // convert src0 and src1 to __fp16
+    __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*M));
+    __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M));
+
+    {
+        const uint64_t t_start = get_time_us();
+
+        for (int i = 0; i < N*M; i++) {
+            src0_fp16[i] = src0[i];
+            //printf("%f %f\n", src0[i], src0_fp16[i]);
+            //assert(!isnan(src0_fp16[i]));
+        }
+
+        for (int i = 0; i < M; i++) {
+            src1_fp16[i] = src1[i];
+        }
+
+        const uint64_t t_end = get_time_us();
+        printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
+    }
+
+    for (int i = 0; i < 16; ++i) {
+        printf("%f %f\n", src0[i], src0_fp16[i]);
+    }
+
+    int method = 0;
+    if (argc > 1) {
+        method = atoi(argv[1]);
+    }
+
+    const int nIter = 1000;
+
+    const clock_t start = clock();
+    const uint64_t start_us = get_time_us();
+
+    double iM = 1.0/M;
+    double sum = 0.0f;
+    for (int i = 0; i < nIter; 
i++) {
+        if (method == 0) {
+            mul_mat_vec_f32_0(src0, src1, dst, N, M);
+        }
+
+        if (method == 1) {
+            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M);
+        }
+
+        if (method == 2) {
+            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M);
+        }
+    }
+
+    for (int i = 0; i < N; i++) {
+        sum += dst[i]*iM;
+    }
+
+    {
+        const clock_t end = clock();
+        const uint64_t end_us = get_time_us();
+        printf("%s: elapsed ticks: %ld\n", __func__, end - start);
+        printf("%s: elapsed us: %llu / %f ms\n", __func__, (unsigned long long)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter);
+    }
+
+    printf("%f\n", sum);
+
+    free(src0);
+    free(src1);
+    free(dst);
+
+    free(src0_fp16);
+    free(src1_fp16);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test-xpos.c b/stable-diffusion.cpp/ggml/tests/test-xpos.c
new file mode 100644
index 0000000000000000000000000000000000000000..9db47d9bcc83204f2361470818d8a650f70b8e2f
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test-xpos.c
@@ -0,0 +1,93 @@
+#include "ggml/ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+bool is_close(float a, float b, float epsilon) {
+    return fabs(a - b) < epsilon;
+}
+
+int main(int argc, char ** argv) {
+    const int n_threads = 1;
+    const int n_embd_head = 4; // aka head_dim
+    const int n_head = 1;
+    const int N = 8;
+
+    struct ggml_init_params params = {
+        .mem_size   = 16*1024*1024,
+        .mem_buffer = NULL,
+    };
+
+    // memory allocation happens here
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * Q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N);
+    struct ggml_tensor * K = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N);
+
+    for (int i = 0; i < ggml_nelements(Q); i++) {
+        ((float*) Q->data)[i] = 2.0f;
+        ((float*) K->data)[i] = 2.0f;
+    }
+
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    int * data = (int *) KQ_pos->data;
+    for (int i = 0; i < N; ++i) {
+        data[i] = 1 + i;
+    }
+
+    struct ggml_tensor * Qx = ggml_rope_xpos_inplace(ctx, Q, KQ_pos, n_embd_head, 512.0f, false);
+    struct ggml_tensor * Kx = ggml_rope_xpos_inplace(ctx, K, KQ_pos, n_embd_head, 512.0f, true);
+
+    struct ggml_cgraph gf = ggml_build_forward(Qx);
+    ggml_build_forward_expand(&gf, Kx);
+    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+
+    // expected output for Qx:
+    // -0.6009  2.7568  1.9782  2.0182
+    // -2.6379  0.9815  1.9562  2.0361
+    // -2.2457 -1.6853  1.9341  2.0538
+    //  0.2043 -2.7934  1.9118  2.0712
+    //  2.4550 -1.3341  1.8894  2.0884
+    //  2.4430  1.3417  1.8668  2.1054
+    //  0.1905  2.7739  1.8440  2.1221
+    // -2.2257  1.6550  1.8212  2.1386
+
+    for (int i = 0; i < ggml_nelements(Q); i++) {
+        if (((float*) Qx->data)[i] > 0) printf(" ");
+        printf("%.4f ", ((float*) Qx->data)[i]);
+        if ((i+1) % n_embd_head == 0) printf("\n");
+    }
+    printf("\n");
+
+    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 0], -2.2257f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 1],  1.6550f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 2],  1.8212f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 3],  2.1386f, 0.0001f));
+
+    // expected output for Kx:
+    // -0.6038  2.7703  1.9816  2.0216
+    // -2.6639  0.9911  1.9630  2.0431
+    // -2.2789 -1.7103  1.9441  2.0644
+    //  0.2083 -2.8486  1.9251  2.0856
+    //  2.5158 -1.3671  1.9057  2.1065
+    //  2.5158  1.3816  1.8862  2.1273
+    //  0.1972  2.8705  1.8665  2.1479
+    // -2.3146  1.7211  1.8465  2.1684
+
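+    // xPos extends RoPE with a per-position magnitude scaling that is applied in
+    // opposite directions to Q and K (the final bool argument selects the direction;
+    // 512.0f is the scale base), so that Q*K^T dot products decay smoothly with
+    // relative distance. This is why the Qx and Kx values differ slightly even
+    // though Q and K hold identical inputs.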
+    for (int i = 0; i < ggml_nelements(K); i++) {
+        if (((float*) Kx->data)[i] > 0) printf(" ");
+        printf("%.4f ", ((float*) Kx->data)[i]);
+        if ((i+1) % n_embd_head == 0) printf("\n");
+    }
+    printf("\n");
+
+    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 0], -2.3146f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 1],  1.7211f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 2],  1.8465f, 0.0001f));
+    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 3],  2.1684f, 0.0001f));
+
+    ggml_free(ctx);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test0.c b/stable-diffusion.cpp/ggml/tests/test0.c
new file mode 100644
index 0000000000000000000000000000000000000000..7fba63e77800d671f8aefa5e79eefc6b267d9e89
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test0.c
@@ -0,0 +1,42 @@
+#include "ggml/ggml.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, const char ** argv) {
+    struct ggml_init_params params = {
+        .mem_size   = 128*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    struct ggml_tensor * t1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 10);
+    struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_I16, 10, 20);
+    struct ggml_tensor * t3 = ggml_new_tensor_3d(ctx0, GGML_TYPE_I32, 10, 20, 30);
+
+    // nb[i] is the stride in bytes along dimension i, so nb[1] spans one full row
+    GGML_ASSERT(t1->n_dims == 1);
+    GGML_ASSERT(t1->ne[0] == 10);
+    GGML_ASSERT(t1->nb[1] == 10*sizeof(float));
+
+    GGML_ASSERT(t2->n_dims == 2);
+    GGML_ASSERT(t2->ne[0] == 10);
+    GGML_ASSERT(t2->ne[1] == 20);
+    GGML_ASSERT(t2->nb[1] == 10*sizeof(int16_t));
+    GGML_ASSERT(t2->nb[2] == 10*20*sizeof(int16_t));
+
+    GGML_ASSERT(t3->n_dims == 3);
+    GGML_ASSERT(t3->ne[0] == 10);
+    GGML_ASSERT(t3->ne[1] == 20);
+    GGML_ASSERT(t3->ne[2] == 30);
+    GGML_ASSERT(t3->nb[1] == 10*sizeof(int32_t));
+    GGML_ASSERT(t3->nb[2] == 10*20*sizeof(int32_t));
+    GGML_ASSERT(t3->nb[3] == 10*20*30*sizeof(int32_t));
+
+    ggml_print_objects(ctx0);
+
+    ggml_free(ctx0);
+
+    return 0;
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test0.zig b/stable-diffusion.cpp/ggml/tests/test0.zig
new file mode 100644
index 0000000000000000000000000000000000000000..01bd6015454b6047d6113b91130b4cc75a25c7fd
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test0.zig
@@ -0,0 +1,41 @@
+const std = @import("std");
+const c = @cImport({
+    @cInclude("ggml/ggml.h");
+});
+
+pub fn main() !void {
+    const params = .{
+        .mem_size   = 128*1024*1024,
+        .mem_buffer = null,
+        .no_alloc   = false,
+    };
+
+    const ctx0 = c.ggml_init(params);
+    defer c.ggml_free(ctx0);
+
+    const t1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 10);
+    const t2 = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_I16, 10, 20);
+    const t3 = c.ggml_new_tensor_3d(ctx0, c.GGML_TYPE_I32, 10, 20, 30);
+
+    try std.testing.expect(t1.*.n_dims == 1);
+    try std.testing.expect(t1.*.ne[0] == 10);
+    try std.testing.expect(t1.*.nb[1] == 10*@sizeOf(f32));
+
+    try std.testing.expect(t2.*.n_dims == 2);
+    try std.testing.expect(t2.*.ne[0] == 10);
+    try std.testing.expect(t2.*.ne[1] == 20);
+    try std.testing.expect(t2.*.nb[1] == 10*@sizeOf(i16));
+    try std.testing.expect(t2.*.nb[2] == 10*20*@sizeOf(i16));
+
+    try std.testing.expect(t3.*.n_dims == 3);
+    try std.testing.expect(t3.*.ne[0] == 10);
+    try std.testing.expect(t3.*.ne[1] == 20);
+    try std.testing.expect(t3.*.ne[2] == 30);
+    try std.testing.expect(t3.*.nb[1] == 10*@sizeOf(i32));
+    try std.testing.expect(t3.*.nb[2] == 10*20*@sizeOf(i32));
+    try std.testing.expect(t3.*.nb[3] == 10*20*30*@sizeOf(i32));
+
+    c.ggml_print_objects(ctx0);
+
+    _ = try std.io.getStdIn().reader().readByte();
+}
diff --git a/stable-diffusion.cpp/ggml/tests/test1.c 
b/stable-diffusion.cpp/ggml/tests/test1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c313bf8e1b662a960cab28f1f397a90b9972943b
--- /dev/null
+++ b/stable-diffusion.cpp/ggml/tests/test1.c
@@ -0,0 +1,438 @@
+#include "ggml/ggml.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, const char ** argv) {
+    const int n_threads = 2;
+
+    struct ggml_init_params params = {
+        .mem_size   = 128*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    {
+        struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+
+        ggml_set_param(ctx0, x);
+
+        struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        struct ggml_tensor * b = ggml_mul(ctx0, x, x);
+        struct ggml_tensor * f = ggml_mul(ctx0, b, a);
+
+        // f     = a*x^2
+        // df/dx = 2*a*x
+
+        ggml_print_objects(ctx0);
+
+        struct ggml_cgraph gf = ggml_build_forward(f);
+        struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+
+        ggml_set_f32(x, 2.0f);
+        ggml_set_f32(a, 3.0f);
+
+        ggml_graph_reset(&gf);
+        ggml_set_f32(f->grad, 1.0f);
+
+        ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+        printf("f = %f\n", ggml_get_f32_1d(f, 0));
+        printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
+
+        GGML_ASSERT(ggml_get_f32_1d(f, 0) == 12.0f);
+        GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 12.0f);
+
+        ggml_set_f32(x, 3.0f);
+
+        ggml_graph_reset(&gf);
+        ggml_set_f32(f->grad, 1.0f);
+
+        ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+        printf("f = %f\n", ggml_get_f32_1d(f, 0));
+        printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
+
+        GGML_ASSERT(ggml_get_f32_1d(f, 0) == 27.0f);
+        GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 18.0f);
+
+        ggml_graph_dump_dot(&gf, NULL, "test1-1-forward.dot");
+        ggml_graph_dump_dot(&gb, &gf, "test1-1-backward.dot");
+    }
+
+    ///////////////////////////////////////////////////////////////
+
+    {
+        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+
+        ggml_set_f32(x1, 3.0f);
+        ggml_set_f32(x2, 1.0f);
+        ggml_set_f32(x3, 0.0f);
+
+        ggml_set_param(ctx0, x1);
+        ggml_set_param(ctx0, x2);
+
+        struct ggml_tensor * y = ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2));
+
+        struct ggml_cgraph gf = ggml_build_forward(y);
+        struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+
+        ggml_graph_reset(&gf);
+        ggml_set_f32(y->grad, 1.0f);
+
+        ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+        printf("y = %f\n", ggml_get_f32_1d(y, 0));
+        printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
+        printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0));
+
+        GGML_ASSERT(ggml_get_f32_1d(y, 0) == 12.0f);
+        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 7.0f);
+        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f);
+
+        struct ggml_tensor * g1 = x1->grad;
+        struct ggml_tensor * g2 = x2->grad;
+
+        struct ggml_cgraph gbb = ggml_build_backward(ctx0, &gb, true);
+
+        ggml_graph_reset(&gb);
+        ggml_set_f32(g1->grad, 1.0f);
+        ggml_set_f32(g2->grad, 1.0f);
+
+        ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads);
+
+        printf("H * [1, 1] = [ %f %f ]\n", ggml_get_f32_1d(x1->grad, 0), ggml_get_f32_1d(x2->grad, 0));
+
+        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 3.0f);
+        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 1.0f);
+
+        ggml_graph_dump_dot(&gf, NULL, "test1-2-forward.dot");
+        ggml_graph_dump_dot(&gb, &gf, "test1-2-backward.dot");
+    }
+
+    ///////////////////////////////////////////////////////////////
+
+    {
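+        // Expected values for this case, derived by hand:
+        //   y      = (x1^2 + x1*x2)*x1 = x1^3 + x1^2*x2
+        //   dy/dx1 = 3*x1^2 + 2*x1*x2  = 27 + 24 = 51  at (x1, x2) = (3, 4)
+        //   dy/dx2 = x1^2              = 9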
+ struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = ggml_mul(ctx0, ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2)), x1); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 4.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0)); + printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 63.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 51.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 9.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-3-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-3-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + ggml_set_param(ctx0, x3); + + struct ggml_tensor * y = ggml_mul(ctx0, ggml_mul(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x2, x2)), x3); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 1.0f); + ggml_set_f32(x2, 2.0f); + ggml_set_f32(x3, 3.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0)); + printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0)); + printf("df/dx3 = %f\n", ggml_get_f32_1d(x3->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 24.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 4.0f); + + struct ggml_tensor * g1 = x1->grad; + struct ggml_tensor * g2 = x2->grad; + struct ggml_tensor * g3 = x3->grad; + + struct ggml_cgraph gbb = ggml_build_backward(ctx0, &gb, true); + + ggml_graph_reset(&gb); + ggml_set_f32(g1->grad, 1.0f); + ggml_set_f32(g2->grad, 1.0f); + ggml_set_f32(g3->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads); + + printf("H * [1, 1, 1] = [ %f %f %f ]\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x3->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 56.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 34.0f); + GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 12.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-4-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-4-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = ggml_sum(ctx0, ggml_mul(ctx0, x1, x2)); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + 
ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 45.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-5-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-5-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_sum(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x1, x2), + ggml_mul(ctx0, + ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1), + ggml_mul(ctx0, x1, x1) + ) + ) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == -9.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-6-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-6-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_sum(ctx0, + ggml_sub(ctx0, + ggml_mul(ctx0, x1, x2), + ggml_mul(ctx0, + ggml_mul(ctx0, x1, x1), + ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1) + ) + ) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + 
GGML_ASSERT(ggml_get_f32_1d(y, 0) == 99.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-7-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-7-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_abs(ctx0, + ggml_sub(ctx0, x1, x2) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 2.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 1.0f); + + ggml_set_f32(x1, 7.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 2.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == -1.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-8-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-8-backward.dot"); + } + + ggml_free(ctx0); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test1.zig b/stable-diffusion.cpp/ggml/tests/test1.zig new file mode 100644 index 0000000000000000000000000000000000000000..e472e00054025c2166196b6b218ce8e4171e9efc --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test1.zig @@ -0,0 +1,459 @@ +const std = @import("std"); +const c = @cImport({ + @cInclude("ggml/ggml.h"); +}); + +pub fn main() !void { + const n_threads = 2; + + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + { + const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x); + + const a = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const b = 
c.ggml_mul(ctx0, x, x); + const f = c.ggml_mul(ctx0, b, a); + + // a*x^2 + // 2*a*x + + c.ggml_print_objects(ctx0); + + const gf = c.ggml_build_forward(f); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x, 2.0); + _ = c.ggml_set_f32(a, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(f.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("f = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)}); + std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(f, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0) == 12.0); + + _ = c.ggml_set_f32(x, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(f.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("f = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)}); + std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(f, 0) == 27.0); + try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0) == 18.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-1-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-1-backward.dot"); + } + + ///////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 1.0); + _ = c.ggml_set_f32(x3, 0.0); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2)); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 7.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + + const g1 = x1.*.grad; + const g2 = x2.*.grad; + + const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true); + + c.ggml_graph_reset(@constCast(&gb)); + _ = c.ggml_set_f32(g1.*.grad, 1.0); + _ = c.ggml_set_f32(g2.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads); + + std.debug.print("H * [1, 1] = [ {d:.6} {d:.6} ]\n", .{c.ggml_get_f32_1d(x1.*.grad, 0), c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 1.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-2-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-2-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_mul(ctx0, c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2)), x1); + + const gf = c.ggml_build_forward(y); + const gb = 
c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 4.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 63.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 51.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 9.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-3-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-3-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + c.ggml_set_param(ctx0, x3); + + const y = c.ggml_mul(ctx0, c.ggml_mul(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x2, x2)), x3); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 1.0); + _ = c.ggml_set_f32(x2, 2.0); + _ = c.ggml_set_f32(x3, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + std.debug.print("df/dx3 = {d:.6}\n", .{c.ggml_get_f32_1d(x3.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 24.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0) == 4.0); + + const g1 = x1.*.grad; + const g2 = x2.*.grad; + const g3 = x3.*.grad; + + const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true); + + c.ggml_graph_reset(@constCast(&gb)); + _ = c.ggml_set_f32(g1.*.grad, 1.0); + _ = c.ggml_set_f32(g2.*.grad, 1.0); + _ = c.ggml_set_f32(g3.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads); + + std.debug.print("H * [1, 1, 1] = [ {d:.6} {d:.6} {d:.6}]\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x3.*.grad, 0), + }); + + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 56.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 34.0); + try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0) == 12.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-4-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-4-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_sum(ctx0, c.ggml_mul(ctx0, x1, x2)); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + 
c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 45.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-5-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-5-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_sum(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x1, x2), + c.ggml_mul(ctx0, + c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1), + c.ggml_mul(ctx0, x1, x1) + ) + ) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == -9.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-6-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-6-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_sum(ctx0, + c.ggml_sub(ctx0, + c.ggml_mul(ctx0, x1, x2), + c.ggml_mul(ctx0, + c.ggml_mul(ctx0, x1, x1), + c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1) + ) + ) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = 
c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 99.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-7-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-7-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_abs(ctx0, + c.ggml_sub(ctx0, x1, x2) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 2.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 1.0); + + _ = c.ggml_set_f32(x1, 7.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 2.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == -1.0); + try 
std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == -1.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-8-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-8-backward.dot"); + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/stable-diffusion.cpp/ggml/tests/test2.c b/stable-diffusion.cpp/ggml/tests/test2.c new file mode 100644 index 0000000000000000000000000000000000000000..839e3e6de91d6b874606e0637d34925ec6ff6dda --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test2.c @@ -0,0 +1,181 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows +#include "ggml/ggml.h" + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +bool is_close(float a, float b, float epsilon) { + return fabs(a - b) < epsilon; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + //opt_params.adam.alpha = 0.01f; + + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS); + + // original threads: 8 + int nthreads = 8; + const char *env = getenv("GGML_NTHREADS"); + if (env != NULL) { + nthreads = atoi(env); + } + if (argc > 1) { + nthreads = atoi(argv[1]); + } + opt_params.n_threads = nthreads; + printf("test2: n_threads:%d\n", opt_params.n_threads); + + const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, }; + float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, }; + + const int n = sizeof(xi)/sizeof(xi[0]); + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n); + struct ggml_tensor * y = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n); + + for (int i = 0; i < n; i++) { + ((float *) x->data)[i] = xi[i]; + ((float *) y->data)[i] = yi[i]; + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, 0.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 0.0f); + + // initialize auto-diff parameters: + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n) + struct ggml_tensor * f = + ggml_div(ctx0, + ggml_sum(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)), + ggml_repeat(ctx0, t0, x)), + y) + ) + ), + ggml_new_f32(ctx0, 2.0f*n)); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + printf("t0 = %f\n", ggml_get_f32_1d(t0, 0)); + printf("t1 = %f\n", ggml_get_f32_1d(t1, 0)); + + GGML_ASSERT(res == GGML_OPT_OK); + + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-3f)); + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, -1.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 9.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n + struct ggml_tensor * f = + ggml_mul(ctx0, + ggml_new_f32(ctx0, 1.0/(2*n)), + ggml_sum(ctx0, + ggml_abs(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)), + ggml_repeat(ctx0, t0, x)), + y) + ) + ) + ); + + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-2f)); +
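// note: xi/yi above satisfy y = 10*x + 5 exactly, so this L1 (absolute-error) fit shares its minimum with the squared-error fit in the previous block; the |.| objective is non-smooth at the optimum, which is why these checks use the looser 1e-2 tolerance instead of 1e-3. +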
GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-2f)); + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, 5.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, -4.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = t0^2 + t1^2 + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_sqr(ctx0, t0), + ggml_sqr(ctx0, t1) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(f, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 0.0f, 1e-3f)); + } + + ///////////////////////////////////////// + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, -7.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 8.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2 + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + t0, + ggml_mul(ctx0, t1, ggml_new_f32(ctx0, 2.0f))), + ggml_new_f32(ctx0, 7.0f) + ) + ), + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, t0, ggml_new_f32(ctx0, 2.0f)), + t1), + ggml_new_f32(ctx0, 5.0f) + ) + ) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(f, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 1.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 3.0f, 1e-3f)); + } + + ggml_free(ctx0); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test2.zig b/stable-diffusion.cpp/ggml/tests/test2.zig new file mode 100644 index 0000000000000000000000000000000000000000..974de0d662aa4d8a1116936fe31c50fffb345bcc --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test2.zig @@ -0,0 +1,165 @@ +const std = @import("std"); +const Thread = std.Thread; +const c = @cImport({ + @cInclude("ggml/ggml.h"); +}); + +fn is_close(a: f32, b: f32, epsilon: f32) bool { + return std.math.fabs(a - b) < epsilon; +} + +pub fn main() !void { + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS); + + const nthreads = try Thread.getCpuCount(); + opt_params.n_threads = @intCast(nthreads); + std.debug.print("test2: n_threads:{}\n", .{opt_params.n_threads}); + + const xi = [_]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; + const yi = [_]f32{ 15.0, 25.0, 35.0, 45.0, 55.0, 65.0, 75.0, 85.0, 95.0, 105.0 }; + + const n = xi.len; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n); + const y = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n); + + for (0..n) |i| { + const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data)); + x_data_pointer[i] = xi[i]; + const y_data_pointer: [*]f32 = @ptrCast(@alignCast(y.*.data)); + y_data_pointer[i] = yi[i]; + } + + { + const t0 = c.ggml_new_f32(ctx0, 0.0); + const t1 = c.ggml_new_f32(ctx0, 0.0); + + // initialize auto-diff parameters: + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n) + const f = + c.ggml_div(ctx0, + c.ggml_sum(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, t1, x)), + c.ggml_repeat(ctx0, t0, x)), + y) + ) + ), + c.ggml_new_f32(ctx0, @as(f32, 2.0)*n)); + + const res = c.ggml_opt(null, opt_params, f); + + std.debug.print("t0 = 
{d:.6}\n", .{c.ggml_get_f32_1d(t0, 0)}); + std.debug.print("t1 = {d:.6}\n", .{c.ggml_get_f32_1d(t1, 0)}); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 5.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-3)); + } + + { + const t0 = c.ggml_new_f32(ctx0, -1.0); + const t1 = c.ggml_new_f32(ctx0, 9.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n + const f = + c.ggml_mul(ctx0, + c.ggml_new_f32(ctx0, @as(f32, 1.0)/(2*n)), + c.ggml_sum(ctx0, + c.ggml_abs(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, t1, x)), + c.ggml_repeat(ctx0, t0, x)), + y) + ) + ) + ); + + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 5.0, 1e-2)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-2)); + } + + { + const t0 = c.ggml_new_f32(ctx0, 5.0); + const t1 = c.ggml_new_f32(ctx0, -4.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = t0^2 + t1^2 + const f = + c.ggml_add(ctx0, + c.ggml_sqr(ctx0, t0), + c.ggml_sqr(ctx0, t1) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(f, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 0.0, 1e-3)); + } + + ///////////////////////////////////////// + + { + const t0 = c.ggml_new_f32(ctx0, -7.0); + const t1 = c.ggml_new_f32(ctx0, 8.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2 + const f = + c.ggml_add(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + t0, + c.ggml_mul(ctx0, t1, c.ggml_new_f32(ctx0, 2.0))), + c.ggml_new_f32(ctx0, 7.0) + ) + ), + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, t0, c.ggml_new_f32(ctx0, 2.0)), + t1), + c.ggml_new_f32(ctx0, 5.0) + ) + ) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(f, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 1.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 3.0, 1e-3)); + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/stable-diffusion.cpp/ggml/tests/test3.c b/stable-diffusion.cpp/ggml/tests/test3.c new file mode 100644 index 0000000000000000000000000000000000000000..b92d6233dcaadbbce8db236e69ce124e0e59e154 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test3.c @@ -0,0 +1,95 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +bool is_close(float a, float b, float epsilon) { + return fabs(a - b) < epsilon; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 1024*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS); + + opt_params.n_threads = (argc > 1) ? 
atoi(argv[1]) : 8; + + const int NP = 1 << 12; + const int NF = 1 << 8; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * F = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, NF, NP); + struct ggml_tensor * l = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NP); + + // regularization weight + struct ggml_tensor * lambda = ggml_new_f32(ctx0, 1e-5f); + + srand(0); + + for (int j = 0; j < NP; j++) { + const float ll = j < NP/2 ? 1.0f : -1.0f; + ((float *)l->data)[j] = ll; + + for (int i = 0; i < NF; i++) { + ((float *)F->data)[j*NF + i] = ((ll > 0 && i < NF/2 ? 1.0f : ll < 0 && i >= NF/2 ? 1.0f : 0.0f) + ((float)rand()/(float)RAND_MAX - 0.5f)*0.1f)/(0.5f*NF); + } + } + + { + // initial guess + struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NF), 0.0f); + + ggml_set_param(ctx0, x); + + // f = sum[(fj*x - l)^2]/n + lambda*|x^2| + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_div(ctx0, + ggml_sum(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_mul_mat(ctx0, F, x), + l) + ) + ), + ggml_new_f32(ctx0, (float)NP) + ), + ggml_mul(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, x)), + lambda) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + + // print results + for (int i = 0; i < 16; i++) { + printf("x[%3d] = %g\n", i, ((float *)x->data)[i]); + } + printf("...\n"); + for (int i = NF - 16; i < NF; i++) { + printf("x[%3d] = %g\n", i, ((float *)x->data)[i]); + } + printf("\n"); + + for (int i = 0; i < NF; ++i) { + if (i < NF/2) { + GGML_ASSERT(is_close(((float *)x->data)[i], 1.0f, 1e-2f)); + } else { + GGML_ASSERT(is_close(((float *)x->data)[i], -1.0f, 1e-2f)); + } + } + } + + ggml_free(ctx0); + + return 0; +} diff --git a/stable-diffusion.cpp/ggml/tests/test3.zig b/stable-diffusion.cpp/ggml/tests/test3.zig new file mode 100644 index 0000000000000000000000000000000000000000..2c9f002f3181c61bc8cdda42d793344090689e35 --- /dev/null +++ b/stable-diffusion.cpp/ggml/tests/test3.zig @@ -0,0 +1,102 @@ +const std = @import("std"); +const Thread = std.Thread; +const c = @cImport({ + @cInclude("stdlib.h"); + @cInclude("ggml/ggml.h"); +}); + +fn is_close(a: f32, b: f32, epsilon: f32) bool { + return std.math.fabs(a - b) < epsilon; +} + +pub fn main() !void { + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS); + + const nthreads = try Thread.getCpuCount(); + opt_params.n_threads = @intCast(nthreads); + + const NP = 1 << 12; + const NF = 1 << 8; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + const F = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_F32, NF, NP); + const l = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NP); + + // regularization weight + const lambda = c.ggml_new_f32(ctx0, 1e-5); + + c.srand(0); + + const l_data_pointer: [*]f32 = @ptrCast(@alignCast(l.*.data)); + const f_data_pointer: [*]f32 = @ptrCast(@alignCast(F.*.data)); + for (0..NP) |j| { + const ll = if (j < NP/2) @as(f32, 1.0) else @as(f32, -1.0); + l_data_pointer[j] = ll; + + for (0..NF) |i| { + const c_rand: f32 = @floatFromInt(c.rand()); + f_data_pointer[j*NF + i] = + ((if (ll > 0 and i < NF/2) @as(f32, 1.0) else + if (ll < 0 and i >= NF/2) @as(f32, 1.0) else @as(f32, 0.0)) + + (c_rand/c.RAND_MAX - 0.5) * 0.1) / (0.5 * NF); + } + } + + { + // initial guess + const x = c.ggml_set_f32(c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NF), 0.0); + + c.ggml_set_param(ctx0, x); + + // f = sum[(fj*x - l)^2]/n + lambda*|x^2| + const f = + 
c.ggml_add(ctx0, + c.ggml_div(ctx0, + c.ggml_sum(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_mul_mat(ctx0, F, x), + l) + ) + ), + c.ggml_new_f32(ctx0, @as(f32, NP)) + ), + c.ggml_mul(ctx0, + c.ggml_sum(ctx0, c.ggml_sqr(ctx0, x)), + lambda) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + + const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data)); + // print results + for (0..16) |i| { + std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]}); + } + std.debug.print("...\n", .{}); + for (NF - 16..NF) |i| { + std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]}); + } + std.debug.print("\n", .{}); + + for (0..NF) |i| { + if (i < NF/2) { + try std.testing.expect(is_close(x_data_pointer[i], 1.0, 1e-2)); + } else { + try std.testing.expect(is_close(x_data_pointer[i], -1.0, 1e-2)); + } + } + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/stable-diffusion.cpp/models/.gitignore b/stable-diffusion.cpp/models/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..33417ffd4bdd568b60efd349348359af67450df3 --- /dev/null +++ b/stable-diffusion.cpp/models/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.ckpt +*.safetensor +*.safetensors +*.log \ No newline at end of file diff --git a/stable-diffusion.cpp/models/README.md b/stable-diffusion.cpp/models/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb1bab3879da8c6783f1b1e8c2edb35975aae4cb --- /dev/null +++ b/stable-diffusion.cpp/models/README.md @@ -0,0 +1,26 @@ +# Model Convert Script + +## Requirements + +- vocab.json, from https://huggingface.co/openai/clip-vit-large-patch14/raw/main/vocab.json + + +```shell +pip install -r requirements.txt +``` + +## Usage +``` +usage: convert.py [-h] [--out_type {f32,f16,q4_0,q4_1,q5_0,q5_1,q8_0}] [--out_file OUT_FILE] model_path + +Convert Stable Diffusion model to GGML compatible file format + +positional arguments: + model_path model file path (*.pth, *.pt, *.ckpt, *.safetensors) + +options: + -h, --help show this help message and exit + --out_type {f32,f16,q4_0,q4_1,q5_0,q5_1,q8_0} + output format (default: based on input) + --out_file OUT_FILE path to write to; default: based on input and current working directory +``` diff --git a/stable-diffusion.cpp/models/convert.py b/stable-diffusion.cpp/models/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..8ef2fcc35d6dc8fe2385696f532da73c9a67ef3b --- /dev/null +++ b/stable-diffusion.cpp/models/convert.py @@ -0,0 +1,385 @@ +import struct +import json +import os + +import numpy as np +import torch +import safetensors.torch + +this_file_dir = os.path.dirname(__file__) +vocab_dir = this_file_dir + +SD1 = 0 +SD2 = 1 + +ggml_ftype_str_to_int = { + "f32": 0, + "f16": 1, + "q4_0": 2, + "q4_1": 3, + "q5_0": 8, + "q5_1": 9, + "q8_0": 7 +} + +ggml_ttype_str_to_int = { + "f32": 0, + "f16": 1, + "q4_0": 2, + "q4_1": 3, + "q5_0": 6, + "q5_1": 7, + "q8_0": 8 +} + +QK4_0 = 32 +def quantize_q4_0(x): + assert x.shape[-1] % QK4_0 == 0 and x.shape[-1] > QK4_0 + x = x.reshape(-1, QK4_0) + max = np.take_along_axis(x, np.argmax(np.abs(x), axis=-1)[:, np.newaxis], axis=-1) + d = max / -8 + qs = ((x / d) + 8).round().clip(min=0, max=15).astype(np.int8) + half = QK4_0 // 2 + qs = qs[:, :half] | (qs[:, half:] << 4) + d = d.astype(np.float16).view(np.int8) + y = np.concatenate((d, qs), axis=-1) + return y + +QK4_1 = 32 +def quantize_q4_1(x): + assert x.shape[-1] % QK4_1 == 0 and x.shape[-1] > QK4_1 + x = x.reshape(-1,
QK4_1) + min = np.min(x, axis=-1, keepdims=True) + max = np.max(x, axis=-1, keepdims=True) + d = (max - min) / ((1 << 4) - 1) + qs = ((x - min) / d).round().clip(min=0, max=15).astype(np.int8) + half = QK4_1 // 2 + qs = qs[:, :half] | (qs[:, half:] << 4) + d = d.astype(np.float16).view(np.int8) + m = min.astype(np.float16).view(np.int8) + y = np.concatenate((d, m, qs), axis=-1) + return y + +QK5_0 = 32 +def quantize_q5_0(x): + assert x.shape[-1] % QK5_0 == 0 and x.shape[-1] > QK5_0 + x = x.reshape(-1, QK5_0) + max = np.take_along_axis(x, np.argmax(np.abs(x), axis=-1)[:, np.newaxis], axis=-1) + d = max / -16 + xi = ((x / d) + 16).round().clip(min=0, max=31).astype(np.int8) + half = QK5_0 // 2 + qs = (xi[:, :half] & 0x0F) | (xi[:, half:] << 4) + qh = np.zeros(qs.shape[:-1], dtype=np.int32) + for i in range(QK5_0): + qh |= ((xi[:, i] & 0x10) >> 4).astype(np.int32) << i + d = d.astype(np.float16).view(np.int8) + qh = qh[..., np.newaxis].view(np.int8) + y = np.concatenate((d, qh, qs), axis=-1) + return y + +QK5_1 = 32 +def quantize_q5_1(x): + assert x.shape[-1] % QK5_1 == 0 and x.shape[-1] > QK5_1 + x = x.reshape(-1, QK5_1) + min = np.min(x, axis=-1, keepdims=True) + max = np.max(x, axis=-1, keepdims=True) + d = (max - min) / ((1 << 5) - 1) + xi = ((x - min) / d).round().clip(min=0, max=31).astype(np.int8) + half = QK5_1//2 + qs = (xi[:, :half] & 0x0F) | (xi[:, half:] << 4) + qh = np.zeros(xi.shape[:-1], dtype=np.int32) + for i in range(QK5_1): + qh |= ((xi[:, i] & 0x10) >> 4).astype(np.int32) << i + d = d.astype(np.float16).view(np.int8) + m = min.astype(np.float16).view(np.int8) + qh = qh[..., np.newaxis].view(np.int8) + ndarray = np.concatenate((d, m, qh, qs), axis=-1) + return ndarray + +QK8_0 = 32 +def quantize_q8_0(x): + assert x.shape[-1] % QK8_0 == 0 and x.shape[-1] > QK8_0 + x = x.reshape(-1, QK8_0) + amax = np.max(np.abs(x), axis=-1, keepdims=True) + d = amax / ((1 << 7) - 1) + qs = (x / d).round().clip(min=-128, max=127).astype(np.int8) + d = d.astype(np.float16).view(np.int8) + y = np.concatenate((d, qs), axis=-1) + return y + +# copy from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py#L16 +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
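+ For example, the space byte 0x20 is remapped to U+0120 ("Ġ"), and an emoji such as U+1F602 (bytes F0 9F 98 82) appears as "ðŁĺĤ", which is the form such tokens take in the vocab.json shipped alongside this script.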
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +def load_model_from_file(model_path): + print("loading model from {}".format(model_path)) + if model_path.lower().endswith(".safetensors"): + pl_sd = safetensors.torch.load_file(model_path, device="cpu") + else: + pl_sd = torch.load(model_path, map_location="cpu") + state_dict = pl_sd["state_dict"] if "state_dict" in pl_sd else pl_sd + print("loading model from {} completed".format(model_path)) + return state_dict + +def get_alpha_comprod(linear_start=0.00085, linear_end=0.0120, timesteps=1000): + betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps, dtype=torch.float32) ** 2 + alphas = 1. - betas + alphas_cumprod = np.cumprod(alphas.numpy(), axis=0) + return torch.tensor(alphas_cumprod) + +unused_tensors = [ + "betas", + "alphas_cumprod_prev", + "sqrt_alphas_cumprod", + "sqrt_one_minus_alphas_cumprod", + "log_one_minus_alphas_cumprod", + "sqrt_recip_alphas_cumprod", + "sqrt_recipm1_alphas_cumprod", + "posterior_variance", + "posterior_log_variance_clipped", + "posterior_mean_coef1", + "posterior_mean_coef2", + "cond_stage_model.transformer.text_model.embeddings.position_ids", + "cond_stage_model.model.logit_scale", + "cond_stage_model.model.text_projection", + "model_ema.decay", + "model_ema.num_updates", + "control_model", + "lora_te_text_model", + "embedding_manager" +] + + +def preprocess(state_dict): + alphas_cumprod = state_dict.get("alphas_cumprod") + if alphas_cumprod != None: + # print((np.abs(get_alpha_comprod().numpy() - alphas_cumprod.numpy()) < 0.000001).all()) + pass + else: + print("no alphas_cumprod in file, generate new one") + alphas_cumprod = get_alpha_comprod() + state_dict["alphas_cumprod"] = alphas_cumprod + + new_state_dict = {} + for name, w in state_dict.items(): + # ignore unused tensors + if not isinstance(w, torch.Tensor): + continue + skip = False + for unused_tensor in unused_tensors: + if name.startswith(unused_tensor): + skip = True + break + if skip: + continue + + # # convert BF16 to FP16 + if w.dtype == torch.bfloat16: + w = w.to(torch.float16) + + # convert open_clip to hf CLIPTextModel (for SD2.x) + open_clip_to_hf_clip_model = { + "cond_stage_model.model.ln_final.bias": "cond_stage_model.transformer.text_model.final_layer_norm.bias", + "cond_stage_model.model.ln_final.weight": "cond_stage_model.transformer.text_model.final_layer_norm.weight", + "cond_stage_model.model.positional_embedding": "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight", + "cond_stage_model.model.token_embedding.weight": "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight", + "first_stage_model.decoder.mid.attn_1.to_k.bias": "first_stage_model.decoder.mid.attn_1.k.bias", + "first_stage_model.decoder.mid.attn_1.to_k.weight": "first_stage_model.decoder.mid.attn_1.k.weight", + "first_stage_model.decoder.mid.attn_1.to_out.0.bias": "first_stage_model.decoder.mid.attn_1.proj_out.bias", + "first_stage_model.decoder.mid.attn_1.to_out.0.weight": "first_stage_model.decoder.mid.attn_1.proj_out.weight", + "first_stage_model.decoder.mid.attn_1.to_q.bias": "first_stage_model.decoder.mid.attn_1.q.bias", + "first_stage_model.decoder.mid.attn_1.to_q.weight": "first_stage_model.decoder.mid.attn_1.q.weight", + 
"first_stage_model.decoder.mid.attn_1.to_v.bias": "first_stage_model.decoder.mid.attn_1.v.bias", + "first_stage_model.decoder.mid.attn_1.to_v.weight": "first_stage_model.decoder.mid.attn_1.v.weight", + } + open_clip_to_hk_clip_resblock = { + "attn.out_proj.bias": "self_attn.out_proj.bias", + "attn.out_proj.weight": "self_attn.out_proj.weight", + "ln_1.bias": "layer_norm1.bias", + "ln_1.weight": "layer_norm1.weight", + "ln_2.bias": "layer_norm2.bias", + "ln_2.weight": "layer_norm2.weight", + "mlp.c_fc.bias": "mlp.fc1.bias", + "mlp.c_fc.weight": "mlp.fc1.weight", + "mlp.c_proj.bias": "mlp.fc2.bias", + "mlp.c_proj.weight": "mlp.fc2.weight", + } + open_clip_resblock_prefix = "cond_stage_model.model.transformer.resblocks." + hf_clip_resblock_prefix = "cond_stage_model.transformer.text_model.encoder.layers." + if name in open_clip_to_hf_clip_model: + new_name = open_clip_to_hf_clip_model[name] + print(f"preprocess {name} => {new_name}") + name = new_name + if name.startswith(open_clip_resblock_prefix): + remain = name[len(open_clip_resblock_prefix):] + idx = remain.split(".")[0] + suffix = remain[len(idx)+1:] + if suffix == "attn.in_proj_weight": + w_q, w_k, w_v = w.chunk(3) + for new_suffix, new_w in zip(["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], [w_q, w_k, w_v]): + new_name = hf_clip_resblock_prefix + idx + "." + new_suffix + new_state_dict[new_name] = new_w + print(f"preprocess {name}{w.size()} => {new_name}{new_w.size()}") + elif suffix == "attn.in_proj_bias": + w_q, w_k, w_v = w.chunk(3) + for new_suffix, new_w in zip(["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], [w_q, w_k, w_v]): + new_name = hf_clip_resblock_prefix + idx + "." + new_suffix + new_state_dict[new_name] = new_w + print(f"preprocess {name}{w.size()} => {new_name}{new_w.size()}") + else: + new_suffix = open_clip_to_hk_clip_resblock[suffix] + new_name = hf_clip_resblock_prefix + idx + "." 
+ new_suffix + new_state_dict[new_name] = w + print(f"preprocess {name} => {new_name}") + continue + + # convert unet transformer linear to conv2d 1x1 + if name.startswith("model.diffusion_model.") and (name.endswith("proj_in.weight") or name.endswith("proj_out.weight")): + if len(w.shape) == 2: + new_w = w.unsqueeze(2).unsqueeze(3) + new_state_dict[name] = new_w + print(f"preprocess {name} {w.size()} => {name} {new_w.size()}") + continue + + # convert vae attn block linear to conv2d 1x1 + if name.startswith("first_stage_model.") and "attn_1" in name: + if len(w.shape) == 2: + new_w = w.unsqueeze(2).unsqueeze(3) + new_state_dict[name] = new_w + print(f"preprocess {name} {w.size()} => {name} {new_w.size()}") + continue + + new_state_dict[name] = w + return new_state_dict + +def convert(model_path, out_type = None, out_file=None): + # load model + with open(os.path.join(vocab_dir, "vocab.json"), encoding="utf-8") as f: + clip_vocab = json.load(f) + + state_dict = load_model_from_file(model_path) + model_type = SD1 + if "cond_stage_model.model.token_embedding.weight" in state_dict.keys(): + model_type = SD2 + print("Stable diffusion 2.x") + else: + print("Stable diffusion 1.x") + state_dict = preprocess(state_dict) + + # output option + if out_type == None: + weight = state_dict["model.diffusion_model.input_blocks.0.0.weight"].numpy() + if weight.dtype == np.float32: + out_type = "f32" + elif weight.dtype == np.float16: + out_type = "f16" + elif weight.dtype == np.float64: + out_type = "f32" + else: + raise Exception("unsupported weight type %s" % weight.dtype) + if out_file == None: + out_file = os.path.splitext(os.path.basename(model_path))[0] + f"-ggml-model-{out_type}.bin" + out_file = os.path.join(os.getcwd(), out_file) + print(f"Saving GGML compatible file to {out_file}") + + # convert and save + with open(out_file, "wb") as file: + # magic: ggml in hex + file.write(struct.pack("i", 0x67676D6C)) + # model & file type + ftype = (model_type << 16) | ggml_ftype_str_to_int[out_type] + file.write(struct.pack("i", ftype)) + + # vocab + byte_encoder = bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + file.write(struct.pack("i", len(clip_vocab))) + for key in clip_vocab: + text = bytearray([byte_decoder[c] for c in key]) + file.write(struct.pack("i", len(text))) + file.write(text) + + # weights + for name in state_dict.keys(): + if not isinstance(state_dict[name], torch.Tensor): + continue + skip = False + for unused_tensor in unused_tensors: + if name.startswith(unused_tensor): + skip = True + break + if skip: + continue + if name in unused_tensors: + continue + data = state_dict[name].numpy() + + n_dims = len(data.shape) + shape = data.shape + old_type = data.dtype + + ttype = "f32" + if n_dims == 4: + data = data.astype(np.float16) + ttype = "f16" + elif n_dims == 2 and name[-7:] == ".weight": + if out_type == "f32": + data = data.astype(np.float32) + elif out_type == "f16": + data = data.astype(np.float16) + elif out_type == "q4_0": + data = quantize_q4_0(data) + elif out_type == "q4_1": + data = quantize_q4_1(data) + elif out_type == "q5_0": + data = quantize_q5_0(data) + elif out_type == "q5_1": + data = quantize_q5_1(data) + elif out_type == "q8_0": + data = quantize_q8_0(data) + else: + raise Exception("invalid out_type {}".format(out_type)) + ttype = out_type + else: + data = data.astype(np.float32) + ttype = "f32" + + print("Processing tensor: {} with shape {}, {} -> {}".format(name, data.shape, old_type, ttype)) + + # header + name_bytes =
name.encode("utf-8") + file.write(struct.pack("iii", n_dims, len(name_bytes), ggml_ttype_str_to_int[ttype])) + for i in range(n_dims): + file.write(struct.pack("i", shape[n_dims - 1 - i])) + file.write(name_bytes) + # data + data.tofile(file) + print("Convert done") + print(f"Saved GGML compatible file to {out_file}") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Convert Stable Diffuison model to GGML compatible file format") + parser.add_argument("--out_type", choices=["f32", "f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], help="output format (default: based on input)") + parser.add_argument("--out_file", help="path to write to; default: based on input and current working directory") + parser.add_argument("model_path", help="model file path (*.pth, *.pt, *.ckpt, *.safetensors)") + args = parser.parse_args() + convert(args.model_path, args.out_type, args.out_file) diff --git a/stable-diffusion.cpp/models/requirements.txt b/stable-diffusion.cpp/models/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecd3ed65d8baa5b419211c862fb5e55255267ae0 --- /dev/null +++ b/stable-diffusion.cpp/models/requirements.txt @@ -0,0 +1,4 @@ +numpy +torch +safetensors +pytorch_lightning \ No newline at end of file diff --git a/stable-diffusion.cpp/models/vocab.json b/stable-diffusion.cpp/models/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..4297ea6a8d2bae1fea8f48b45e257814dcb11f69 --- /dev/null +++ b/stable-diffusion.cpp/models/vocab.json @@ -0,0 +1 @@ +{"!": 0, "\"": 1, "#": 2, "$": 3, "%": 4, "&": 5, "'": 6, "(": 7, ")": 8, "*": 9, "+": 10, ",": 11, "-": 12, ".": 13, "/": 14, "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, ":": 25, ";": 26, "<": 27, "=": 28, ">": 29, "?": 30, "@": 31, "A": 32, "B": 33, "C": 34, "D": 35, "E": 36, "F": 37, "G": 38, "H": 39, "I": 40, "J": 41, "K": 42, "L": 43, "M": 44, "N": 45, "O": 46, "P": 47, "Q": 48, "R": 49, "S": 50, "T": 51, "U": 52, "V": 53, "W": 54, "X": 55, "Y": 56, "Z": 57, "[": 58, "\\": 59, "]": 60, "^": 61, "_": 62, "`": 63, "a": 64, "b": 65, "c": 66, "d": 67, "e": 68, "f": 69, "g": 70, "h": 71, "i": 72, "j": 73, "k": 74, "l": 75, "m": 76, "n": 77, "o": 78, "p": 79, "q": 80, "r": 81, "s": 82, "t": 83, "u": 84, "v": 85, "w": 86, "x": 87, "y": 88, "z": 89, "{": 90, "|": 91, "}": 92, "~": 93, "¡": 94, "¢": 95, "£": 96, "¤": 97, "¥": 98, "¦": 99, "§": 100, "¨": 101, "©": 102, "ª": 103, "«": 104, "¬": 105, "®": 106, "¯": 107, "°": 108, "±": 109, "²": 110, "³": 111, "´": 112, "µ": 113, "¶": 114, "·": 115, "¸": 116, "¹": 117, "º": 118, "»": 119, "¼": 120, "½": 121, "¾": 122, "¿": 123, "À": 124, "Á": 125, "Â": 126, "Ã": 127, "Ä": 128, "Å": 129, "Æ": 130, "Ç": 131, "È": 132, "É": 133, "Ê": 134, "Ë": 135, "Ì": 136, "Í": 137, "Î": 138, "Ï": 139, "Ð": 140, "Ñ": 141, "Ò": 142, "Ó": 143, "Ô": 144, "Õ": 145, "Ö": 146, "×": 147, "Ø": 148, "Ù": 149, "Ú": 150, "Û": 151, "Ü": 152, "Ý": 153, "Þ": 154, "ß": 155, "à": 156, "á": 157, "â": 158, "ã": 159, "ä": 160, "å": 161, "æ": 162, "ç": 163, "è": 164, "é": 165, "ê": 166, "ë": 167, "ì": 168, "í": 169, "î": 170, "ï": 171, "ð": 172, "ñ": 173, "ò": 174, "ó": 175, "ô": 176, "õ": 177, "ö": 178, "÷": 179, "ø": 180, "ù": 181, "ú": 182, "û": 183, "ü": 184, "ý": 185, "þ": 186, "ÿ": 187, "Ā": 188, "ā": 189, "Ă": 190, "ă": 191, "Ą": 192, "ą": 193, "Ć": 194, "ć": 195, "Ĉ": 196, "ĉ": 197, "Ċ": 198, "ċ": 199, "Č": 200, "č": 201, "Ď": 202, "ď": 203, "Đ": 204, "đ": 205, "Ē": 206, "ē": 207, "Ĕ": 
208, "ĕ": 209, "Ė": 210, "ė": 211, "Ę": 212, "ę": 213, "Ě": 214, "ě": 215, "Ĝ": 216, "ĝ": 217, "Ğ": 218, "ğ": 219, "Ġ": 220, "ġ": 221, "Ģ": 222, "ģ": 223, "Ĥ": 224, "ĥ": 225, "Ħ": 226, "ħ": 227, "Ĩ": 228, "ĩ": 229, "Ī": 230, "ī": 231, "Ĭ": 232, "ĭ": 233, "Į": 234, "į": 235, "İ": 236, "ı": 237, "IJ": 238, "ij": 239, "Ĵ": 240, "ĵ": 241, "Ķ": 242, "ķ": 243, "ĸ": 244, "Ĺ": 245, "ĺ": 246, "Ļ": 247, "ļ": 248, "Ľ": 249, "ľ": 250, "Ŀ": 251, "ŀ": 252, "Ł": 253, "ł": 254, "Ń": 255, "!": 256, "\"": 257, "#": 258, "$": 259, "%": 260, "&": 261, "'": 262, "(": 263, ")": 264, "*": 265, "+": 266, ",": 267, "-": 268, ".": 269, "/": 270, "0": 271, "1": 272, "2": 273, "3": 274, "4": 275, "5": 276, "6": 277, "7": 278, "8": 279, "9": 280, ":": 281, ";": 282, "<": 283, "=": 284, ">": 285, "?": 286, "@": 287, "A": 288, "B": 289, "C": 290, "D": 291, "E": 292, "F": 293, "G": 294, "H": 295, "I": 296, "J": 297, "K": 298, "L": 299, "M": 300, "N": 301, "O": 302, "P": 303, "Q": 304, "R": 305, "S": 306, "T": 307, "U": 308, "V": 309, "W": 310, "X": 311, "Y": 312, "Z": 313, "[": 314, "\\": 315, "]": 316, "^": 317, "_": 318, "`": 319, "a": 320, "b": 321, "c": 322, "d": 323, "e": 324, "f": 325, "g": 326, "h": 327, "i": 328, "j": 329, "k": 330, "l": 331, "m": 332, "n": 333, "o": 334, "p": 335, "q": 336, "r": 337, "s": 338, "t": 339, "u": 340, "v": 341, "w": 342, "x": 343, "y": 344, "z": 345, "{": 346, "|": 347, "}": 348, "~": 349, "¡": 350, "¢": 351, "£": 352, "¤": 353, "¥": 354, "¦": 355, "§": 356, "¨": 357, "©": 358, "ª": 359, "«": 360, "¬": 361, "®": 362, "¯": 363, "°": 364, "±": 365, "²": 366, "³": 367, "´": 368, "µ": 369, "¶": 370, "·": 371, "¸": 372, "¹": 373, "º": 374, "»": 375, "¼": 376, "½": 377, "¾": 378, "¿": 379, "À": 380, "Á": 381, "Â": 382, "Ã": 383, "Ä": 384, "Å": 385, "Æ": 386, "Ç": 387, "È": 388, "É": 389, "Ê": 390, "Ë": 391, "Ì": 392, "Í": 393, "Î": 394, "Ï": 395, "Ð": 396, "Ñ": 397, "Ò": 398, "Ó": 399, "Ô": 400, "Õ": 401, "Ö": 402, "×": 403, "Ø": 404, "Ù": 405, "Ú": 406, "Û": 407, "Ü": 408, "Ý": 409, "Þ": 410, "ß": 411, "à": 412, "á": 413, "â": 414, "ã": 415, "ä": 416, "å": 417, "æ": 418, "ç": 419, "è": 420, "é": 421, "ê": 422, "ë": 423, "ì": 424, "í": 425, "î": 426, "ï": 427, "ð": 428, "ñ": 429, "ò": 430, "ó": 431, "ô": 432, "õ": 433, "ö": 434, "÷": 435, "ø": 436, "ù": 437, "ú": 438, "û": 439, "ü": 440, "ý": 441, "þ": 442, "ÿ": 443, "Ā": 444, "ā": 445, "Ă": 446, "ă": 447, "Ą": 448, "ą": 449, "Ć": 450, "ć": 451, "Ĉ": 452, "ĉ": 453, "Ċ": 454, "ċ": 455, "Č": 456, "č": 457, "Ď": 458, "ď": 459, "Đ": 460, "đ": 461, "Ē": 462, "ē": 463, "Ĕ": 464, "ĕ": 465, "Ė": 466, "ė": 467, "Ę": 468, "ę": 469, "Ě": 470, "ě": 471, "Ĝ": 472, "ĝ": 473, "Ğ": 474, "ğ": 475, "Ġ": 476, "ġ": 477, "Ģ": 478, "ģ": 479, "Ĥ": 480, "ĥ": 481, "Ħ": 482, "ħ": 483, "Ĩ": 484, "ĩ": 485, "Ī": 486, "ī": 487, "Ĭ": 488, "ĭ": 489, "Į": 490, "į": 491, "İ": 492, "ı": 493, "IJ": 494, "ij": 495, "Ĵ": 496, "ĵ": 497, "Ķ": 498, "ķ": 499, "ĸ": 500, "Ĺ": 501, "ĺ": 502, "Ļ": 503, "ļ": 504, "Ľ": 505, "ľ": 506, "Ŀ": 507, "ŀ": 508, "Ł": 509, "ł": 510, "Ń": 511, "in": 512, "th": 513, "an": 514, "re": 515, "ar": 516, "er": 517, "the": 518, "ing": 519, "ou": 520, "on": 521, "st": 522, "or": 523, "en": 524, "on": 525, "al": 526, "at": 527, "er": 528, "it": 529, "in": 530, "to": 531, "ro": 532, "is": 533, "le": 534, "ic": 535, "at": 536, "and": 537, "ed": 538, "of": 539, "ch": 540, "or": 541, "es": 542, "il": 543, "el": 544, "st": 545, "ac": 546, "om": 547, "am": 548, "lo": 549, "an": 550, "ay": 551, "sh": 552, "ri": 553, "li": 554, "ti": 555, "for": 556, "ne": 557, 
"ðŁ": 558, "ra": 559, "ha": 560, "de": 561, "ol": 562, "ve": 563, "si": 564, "ur": 565, "al": 566, "se": 567, "'s": 568, "un": 569, "di": 570, "be": 571, "la": 572, "wh": 573, "oo": 574, "day": 575, "en": 576, "ma": 577, "no": 578, "le": 579, "to": 580, "our": 581, "ir": 582, "gh": 583, "wit": 584, "it": 585, "yo": 586, "as": 587, "sp": 588, "this": 589, "ts": 590, "ati": 591, "you": 592, "with": 593, "ad": 594, "is": 595, "ab": 596, "ly": 597, "we": 598, "the": 599, "te": 600, "as": 601, "ag": 602, "vi": 603, "pp": 604, "su": 605, "ho": 606, "my": 607, "..": 608, "bu": 609, "com": 610, "se": 611, "ers": 612, "me": 613, "me": 614, "all": 615, "con": 616, "mo": 617, "ke": 618, "ge": 619, "out": 620, "ent": 621, "co": 622, "fe": 623, "ver": 624, "ar": 625, "fro": 626, "au": 627, "po": 628, "ce": 629, "ght": 630, "are": 631, "ss": 632, "from": 633, "ch": 634, "tr": 635, "oun": 636, "one": 637, "by": 638, "do": 639, "th": 640, "wor": 641, "ere": 642, "ke": 643, "pro": 644, "for": 645, "ds": 646, "bo": 647, "ta": 648, "we": 649, "go": 650, "he": 651, "ter": 652, "ing": 653, "de": 654, "be": 655, "ation": 656, "mor": 657, "ay": 658, "ex": 659, "ill": 660, "pe": 661, "ks": 662, "sc": 663, "lu": 664, "fu": 665, "qu": 666, "ver": 667, "ðŁĺ": 668, "ju": 669, "mu": 670, "ate": 671, "and": 672, "ve": 673, "king": 674, "mar": 675, "op": 676, "hi": 677, "...": 678, "pre": 679, "ad": 680, "ru": 681, "that": 682, "jo": 683, "of": 684, "ce": 685, "new": 686, "am": 687, "ap": 688, "gre": 689, "ss": 690, "du": 691, "now": 692, "ye": 693, "ting": 694, "your": 695, "ity": 696, "ni": 697, "ci": 698, "par": 699, "gu": 700, "fi": 701, "af": 702, "per": 703, "ter": 704, "up": 705, "so": 706, "gi": 707, "ons": 708, "gr": 709, "ge": 710, "br": 711, "pl": 712, "'t": 713, "mi": 714, "ine": 715, "wee": 716, "bi": 717, "us": 718, "sho": 719, "have": 720, "today": 721, "av": 722, "man": 723, "ent": 724, "ack": 725, "ure": 726, "our": 727, "âĢ": 728, "cu": 729, "ld": 730, "loo": 731, "im": 732, "ice": 733, "som": 734, "fin": 735, "red": 736, "ren": 737, "ood": 738, "was": 739, "tion": 740, "pi": 741, "ir": 742, "ther": 743, "ty": 744, "ph": 745, "ard": 746, "ec": 747, "!!": 748, "mon": 749, "more": 750, "will": 751, "tra": 752, "can": 753, "col": 754, "pu": 755, "te": 756, "wn": 757, "mb": 758, "so": 759, "iti": 760, "just": 761, "ning": 762, "here": 763, "tu": 764, "pa": 765, "pr": 766, "but": 767, "what": 768, "ally": 769, "fir": 770, "min": 771, "ca": 772, "ant": 773, "sa": 774, "ted": 775, "ev": 776, "ment": 777, "fa": 778, "get": 779, "ame": 780, "about": 781, "gra": 782, "not": 783, "happ": 784, "ays": 785, "man": 786, "his": 787, "time": 788, "like": 789, "gh": 790, "has": 791, "than": 792, "love": 793, "art": 794, "ste": 795, "ding": 796, "he": 797, "cre": 798, "ws": 799, "wat": 800, "der": 801, "ite": 802, "ser": 803, "ace": 804, "age": 805, "end": 806, "str": 807, "aw": 808, "stor": 809, "re": 810, "car": 811, "ell": 812, "all": 813, "ps": 814, "fri": 815, "pho": 816, "por": 817, "do": 818, "ak": 819, "wi": 820, "fre": 821, "who": 822, "shi": 823, "boo": 824, "son": 825, "ell": 826, "when": 827, "ill": 828, "how": 829, "great": 830, "win": 831, "el": 832, "bl": 833, "ssi": 834, "ali": 835, "some": 836, "ðŁĴ": 837, "ton": 838, "der": 839, "les": 840, "pla": 841, "ï¸": 842, "ed": 843, "sch": 844, "hu": 845, "ong": 846, "don": 847, "ki": 848, "sh": 849, "ann": 850, "cor": 851, "..": 852, "ound": 853, "az": 854, "ine": 855, "ary": 856, "ful": 857, "stu": 858, "ould": 859, "sti": 860, "go": 861, "see": 862, "able": 
863, "ars": 864, "ll": 865, "mis": 866, "ber": 867, "ck": 868, "wa": 869, "ents": 870, "no": 871, "sig": 872, "fe": 873, "first": 874, "et": 875, "spe": 876, "ack": 877, "if": 878, "ous": 879, "'m": 880, "ster": 881, "app": 882, "ang": 883, "ance": 884, "ans": 885, "good": 886, "bre": 887, "ever": 888, "they": 889, "tic": 890, "come": 891, "off": 892, "back": 893, "ase": 894, "ings": 895, "old": 896, "ight": 897, "fo": 898, "her": 899, "happy": 900, "pic": 901, "its": 902, "ving": 903, "us": 904, "mat": 905, "hom": 906, "dy": 907, "em": 908, "sk": 909, "ying": 910, "their": 911, "led": 912, "ry": 913, "ul": 914, "har": 915, "ck": 916, "ton": 917, "onal": 918, "hel": 919, "ric": 920, "bir": 921, "vie": 922, "way": 923, "tri": 924, "da": 925, "ple": 926, "bro": 927, "sto": 928, "ool": 929, "night": 930, "tru": 931, "ba": 932, "read": 933, "res": 934, "year": 935, "fr": 936, "tor": 937, "als": 938, "coun": 939, "cla": 940, "ture": 941, "vel": 942, "ated": 943, "lec": 944, "end": 945, "thing": 946, "vo": 947, "ici": 948, "best": 949, "can": 950, "work": 951, "last": 952, "after": 953, "ence": 954, "pri": 955, "pe": 956, "es": 957, "il": 958, "âĢ¦": 959, "dre": 960, "ys": 961, "over": 962, "ies": 963, "ðŁij": 964, "comm": 965, "tw": 966, "ink": 967, "sun": 968, "cl": 969, "life": 970, "tt": 971, "ach": 972, "land": 973, "sy": 974, "tre": 975, "tal": 976, "pol": 977, "sm": 978, "duc": 979, "sal": 980, "ft": 981, "'re": 982, "che": 983, "war": 984, "tur": 985, "ations": 986, "ach": 987, "ms": 988, "ile": 989, "pm": 990, "ough": 991, "ate": 992, "star": 993, "week": 994, "!!!": 995, "clu": 996, "there": 997, "ner": 998, "tom": 999, "sel": 1000, "ï¸ı": 1001, "world": 1002, "ves": 1003, "cam": 1004, "got": 1005, "inter": 1006, "off": 1007, "um": 1008, "tonight": 1009, "other": 1010, "hou": 1011, "look": 1012, "je": 1013, "id": 1014, "sion": 1015, "beau": 1016, "att": 1017, "eli": 1018, "ort": 1019, "rec": 1020, "ff": 1021, "ster": 1022, "supp": 1023, "gen": 1024, "been": 1025, "ily": 1026, "team": 1027, "mm": 1028, "ic": 1029, "peop": 1030, "itt": 1031, "ats": 1032, "only": 1033, "mber": 1034, "eng": 1035, "bri": 1036, "mp": 1037, "know": 1038, "bur": 1039, "bar": 1040, "ins": 1041, "low": 1042, "she": 1043, "row": 1044, "âĿ": 1045, "tro": 1046, "people": 1047, "via": 1048, "low": 1049, "aga": 1050, "bet": 1051, "xt": 1052, "fac": 1053, "char": 1054, "ear": 1055, "wal": 1056, "sen": 1057, "fam": 1058, "ble": 1059, "nati": 1060, "ish": 1061, "nor": 1062, "game": 1063, "live": 1064, "sco": 1065, "ley": 1066, "don": 1067, "ick": 1068, "ball": 1069, "very": 1070, "these": 1071, "pan": 1072, "ia": 1073, "ating": 1074, "cr": 1075, "are": 1076, "gir": 1077, "make": 1078, "stre": 1079, "show": 1080, ".\"": 1081, "fl": 1082, "up": 1083, "dr": 1084, "thanks": 1085, "illi": 1086, "wom": 1087, "sts": 1088, "ig": 1089, "sur": 1090, "every": 1091, "cur": 1092, "view": 1093, "let": 1094, "into": 1095, "most": 1096, "na": 1097, "indi": 1098, "gar": 1099, "had": 1100, "sou": 1101, "ved": 1102, "ant": 1103, "ition": 1104, "made": 1105, "fol": 1106, "uni": 1107, "ited": 1108, "ðŁı": 1109, "ical": 1110, "thr": 1111, "ready": 1112, "chec": 1113, "dra": 1114, "kes": 1115, "book": 1116, "ep": 1117, "sic": 1118, "morning": 1119, "news": 1120, "cau": 1121, "ct": 1122, "well": 1123, "anc": 1124, "photo": 1125, "than": 1126, "ors": 1127, "birth": 1128, "gg": 1129, "out": 1130, "next": 1131, "some": 1132, "ening": 1133, "story": 1134, "chri": 1135, "down": 1136, "home": 1137, "ffe": 1138, "free": 1139, "da": 1140, "bor": 1141, 
"fil": 1142, "cial": 1143, "thank": 1144, "side": 1145, "lear": 1146, "que": 1147, "line": 1148, "ten": 1149, "ates": 1150, "years": 1151, "my": 1152, "photo": 1153, "beauti": 1154, "right": 1155, "nu": 1156, "form": 1157, "ship": 1158, "ban": 1159, "ther": 1160, "days": 1161, "gam": 1162, "ason": 1163, "gy": 1164, "ðŁİ": 1165, "birthday": 1166, "set": 1167, "ick": 1168, "et": 1169, "still": 1170, "coming": 1171, "take": 1172, "ðŁĩ": 1173, "bb": 1174, "sol": 1175, "son": 1176, "den": 1177, "ep": 1178, "music": 1179, "them": 1180, "den": 1181, "why": 1182, "foo": 1183, "cra": 1184, "amaz": 1185, "wn": 1186, "hol": 1187, "tting": 1188, "wr": 1189, "ue": 1190, "mag": 1191, "cro": 1192, "lan": 1193, "clo": 1194, "bra": 1195, "ak": 1196, "sing": 1197, "cal": 1198, "read": 1199, "'ve": 1200, "joh": 1201, "bab": 1202, "dri": 1203, "blo": 1204, "big": 1205, "eric": 1206, "int": 1207, "tor": 1208, "try": 1209, "la": 1210, "leg": 1211, "house": 1212, "mic": 1213, "val": 1214, "beautiful": 1215, "litt": 1216, "check": 1217, "new": 1218, "vers": 1219, "sw": 1220, "ari": 1221, "play": 1222, "her": 1223, "âĢĵ": 1224, "win": 1225, "ma": 1226, "congr": 1227, "school": 1228, "fun": 1229, ".@": 1230, "heal": 1231, "ich": 1232, "del": 1233, "where": 1234, "lon": 1235, "ket": 1236, "two": 1237, "much": 1238, "watch": 1239, "ven": 1240, "ded": 1241, "ast": 1242, "ked": 1243, "bas": 1244, "going": 1245, "mp": 1246, "ever": 1247, "ways": 1248, "roo": 1249, "desig": 1250, "ly": 1251, "sed": 1252, "top": 1253, "lin": 1254, "chan": 1255, "too": 1256, "iting": 1257, "dent": 1258, "ghts": 1259, "ty": 1260, "spo": 1261, "need": 1262, "blu": 1263, "inst": 1264, "being": 1265, "âĿ¤": 1266, "wel": 1267, "ls": 1268, "him": 1269, "may": 1270, "sting": 1271, "na": 1272, "ely": 1273, "little": 1274, "ga": 1275, "nat": 1276, "tomor": 1277, "mc": 1278, "hon": 1279, "want": 1280, "air": 1281, "pic": 1282, "americ": 1283, "per": 1284, "less": 1285, "week": 1286, "vel": 1287, "ah": 1288, "cap": 1289, "cham": 1290, "ger": 1291, "tim": 1292, "tomorrow": 1293, "ness": 1294, "state": 1295, "hal": 1296, "serv": 1297, "ze": 1298, "os": 1299, "pat": 1300, "vis": 1301, "exc": 1302, "sin": 1303, "ff": 1304, "city": 1305, "cen": 1306, "any": 1307, "bel": 1308, "summ": 1309, "tin": 1310, "would": 1311, "looking": 1312, "ko": 1313, "cele": 1314, "family": 1315, "mer": 1316, "pow": 1317, "help": 1318, "bus": 1319, "co": 1320, "cle": 1321, "self": 1322, "ens": 1323, "ics": 1324, "tho": 1325, "ani": 1326, "cho": 1327, "lead": 1328, "bs": 1329, "twee": 1330, "think": 1331, "fore": 1332, "chil": 1333, "vide": 1334, "did": 1335, "ale": 1336, "chi": 1337, "vil": 1338, "ends": 1339, "wing": 1340, "pas": 1341, "'ll": 1342, "vol": 1343, "sa": 1344, "gs": 1345, "many": 1346, "jec": 1347, "before": 1348, "graph": 1349, "ny": 1350, "uring": 1351, "wil": 1352, "dd": 1353, "buil": 1354, "fav": 1355, "sted": 1356, "tran": 1357, "ling": 1358, "oud": 1359, "dge": 1360, "fiel": 1361, "national": 1362, "sta": 1363, "cer": 1364, "were": 1365, "ina": 1366, "season": 1367, "cou": 1368, "ned": 1369, "amazing": 1370, "tions": 1371, "celebr": 1372, "ns": 1373, "ath": 1374, "head": 1375, "sday": 1376, "dar": 1377, "loc": 1378, "vin": 1379, "another": 1380, "goo": 1381, "sat": 1382, "ny": 1383, "join": 1384, "pres": 1385, "ses": 1386, "sing": 1387, "ana": 1388, "ining": 1389, "....": 1390, "cour": 1391, "ï¸ı": 1392, "act": 1393, "cause": 1394, "light": 1395, "ams": 1396, "ta": 1397, "bal": 1398, "fc": 1399, "high": 1400, "offici": 1401, "tt": 1402, "christ": 1403, 
"dic": 1404, "day": 1405, "ral": 1406, "hor": 1407, ":)": 1408, "visi": 1409, "nam": 1410, "ob": 1411, "mas": 1412, "ght": 1413, "really": 1414, "tun": 1415, "find": 1416, "through": 1417, "port": 1418, "ut": 1419, "tive": 1420, "sty": 1421, "ne": 1422, "ore": 1423, "ðŁĺĤ": 1424, "support": 1425, "never": 1426, "even": 1427, "ðŁĶ": 1428, "ha": 1429, "ya": 1430, "ld": 1431, "uk": 1432, "ran": 1433, "jam": 1434, "with": 1435, "medi": 1436, "des": 1437, "ney": 1438, "ching": 1439, "ale": 1440, "hy": 1441, "kin": 1442, "!!": 1443, "dy": 1444, "place": 1445, "also": 1446, "ble": 1447, "which": 1448, "black": 1449, "bli": 1450, "say": 1451, "park": 1452, "play": 1453, "ire": 1454, "video": 1455, "weekend": 1456, "ail": 1457, "key": 1458, "pt": 1459, "ward": 1460, "friday": 1461, "din": 1462, "iness": 1463, "gro": 1464, "ben": 1465, "always": 1466, "tball": 1467, "ago": 1468, "mil": 1469, "cy": 1470, "produc": 1471, "disc": 1472, "under": 1473, "please": 1474, "spor": 1475, "full": 1476, "ey": 1477, "ðŁĻ": 1478, "ise": 1479, "ities": 1480, "cat": 1481, "kno": 1482, "use": 1483, "fore": 1484, "ker": 1485, "art": 1486, "high": 1487, "open": 1488, "san": 1489, "ef": 1490, "ours": 1491, "shed": 1492, "stri": 1493, "dro": 1494, "again": 1495, "im": 1496, "ðŁĵ": 1497, "enjo": 1498, "fun": 1499, "getting": 1500, "pen": 1501, "ger": 1502, "cli": 1503, "any": 1504, "every": 1505, "eu": 1506, "women": 1507, "âľ": 1508, "est": 1509, "could": 1510, "ry": 1511, "\"@": 1512, "thou": 1513, "sha": 1514, "commun": 1515, "ber": 1516, "dents": 1517, "dis": 1518, "while": 1519, "away": 1520, "dio": 1521, "ham": 1522, "gla": 1523, "date": 1524, "ka": 1525, "miss": 1526, "unch": 1527, "won": 1528, "inf": 1529, "room": 1530, "ga": 1531, "real": 1532, "exper": 1533, "direc": 1534, "should": 1535, "spr": 1536, "gol": 1537, "long": 1538, "better": 1539, "ori": 1540, "ey": 1541, "ience": 1542, "ils": 1543, "zz": 1544, "han": 1545, "found": 1546, "vs": 1547, "âĻ": 1548, "post": 1549, "tic": 1550, "part": 1551, "men": 1552, "rence": 1553, "cess": 1554, "vic": 1555, "sil": 1556, "shop": 1557, "ðŁĺĤ": 1558, "food": 1559, "val": 1560, "stic": 1561, "you": 1562, "says": 1563, "elec": 1564, "star": 1565, "oc": 1566, "land": 1567, "id": 1568, "ction": 1569, "field": 1570, "sof": 1571, "start": 1572, "water": 1573, "friends": 1574, "ones": 1575, "ðŁĮ": 1576, "fla": 1577, "far": 1578, "white": 1579, "party": 1580, "inst": 1581, "grou": 1582, "tv": 1583, "everyone": 1584, "ment": 1585, "ja": 1586, "cha": 1587, "prin": 1588, "ants": 1589, "during": 1590, "lat": 1591, "lar": 1592, "west": 1593, "then": 1594, "ka": 1595, "youn": 1596, "insp": 1597, "inte": 1598, "ween": 1599, "visit": 1600, "against": 1601, "rele": 1602, "head": 1603, "ces": 1604, "town": 1605, "looks": 1606, "thre": 1607, "regi": 1608, "rent": 1609, "projec": 1610, "girl": 1611, "sear": 1612, "wo": 1613, "mom": 1614, "car": 1615, "hun": 1616, "publi": 1617, "di": 1618, "ple": 1619, "call": 1620, "cri": 1621, "um": 1622, "ford": 1623, "perfe": 1624, "friend": 1625, "hard": 1626, "ssion": 1627, "test": 1628, "playing": 1629, "around": 1630, "because": 1631, "kets": 1632, "meet": 1633, "satur": 1634, "arti": 1635, "work": 1636, "jun": 1637, "ven": 1638, "run": 1639, "member": 1640, "port": 1641, "super": 1642, "twit": 1643, "sam": 1644, "els": 1645, "tly": 1646, "adv": 1647, "ative": 1648, "ath": 1649, "sure": 1650, "avail": 1651, "lar": 1652, "squ": 1653, "ards": 1654, "event": 1655, "men": 1656, "ll": 1657, "over": 1658, "logy": 1659, "ital": 1660, "times": 1661, "mal": 
1662, "back": 1663, "coo": 1664, "making": 1665, "stru": 1666, "âģ": 1667, "itu": 1668, "shar": 1669, "gan": 1670, "cas": 1671, "sn": 1672, "summer": 1673, "picture": 1674, "fan": 1675, "hin": 1676, "christmas": 1677, "cy": 1678, "proud": 1679, "champi": 1680, "design": 1681, "pping": 1682, "hope": 1683, "ca": 1684, "available": 1685, "may": 1686, "wed": 1687, "photograph": 1688, "special": 1689, "sale": 1690, "stop": 1691, "ery": 1692, "awe": 1693, "ality": 1694, "history": 1695, "ama": 1696, "presi": 1697, "bru": 1698, "working": 1699, "done": 1700, "dr": 1701, "ken": 1702, "feat": 1703, "wood": 1704, "atest": 1705, "sunday": 1706, "movi": 1707, "vely": 1708, "sle": 1709, "face": 1710, "spec": 1711, "students": 1712, "by": 1713, "ham": 1714, "spon": 1715, "business": 1716, "dat": 1717, "ie": 1718, "ip": 1719, "soci": 1720, "glo": 1721, "hand": 1722, "recor": 1723, "rs": 1724, "mee": 1725, "keep": 1726, "pur": 1727, "health": 1728, "she": 1729, "comple": 1730, "god": 1731, "davi": 1732, "collec": 1733, "list": 1734, "ra": 1735, "club": 1736, "ters": 1737, "inclu": 1738, "things": 1739, "plan": 1740, "âĺ": 1741, "john": 1742, "shing": 1743, "atul": 1744, "soon": 1745, "blue": 1746, "gor": 1747, "saturday": 1748, "won": 1749, "congratul": 1750, "see": 1751, "âĿ¤ï¸ı": 1752, "those": 1753, "ðŁĺį": 1754, "final": 1755, "dou": 1756, "ith": 1757, "own": 1758, "road": 1759, "tour": 1760, "ast": 1761, "india": 1762, "til": 1763, "nd": 1764, "fer": 1765, "favor": 1766, "sul": 1767, "learn": 1768, "fire": 1769, "just": 1770, "group": 1771, "ah": 1772, "rac": 1773, "body": 1774, "ur": 1775, "care": 1776, "à¸": 1777, "plo": 1778, "oh": 1779, "pos": 1780, "give": 1781, "tech": 1782, "sub": 1783, "cent": 1784, "ering": 1785, "ym": 1786, "ility": 1787, "fic": 1788, "london": 1789, "vir": 1790, "guys": 1791, "ba": 1792, "ðŁ¤": 1793, "baby": 1794, "scre": 1795, "ðŁĺį": 1796, "trump": 1797, "under": 1798, "change": 1799, "ian": 1800, "colle": 1801, "sses": 1802, "ler": 1803, "ssed": 1804, "nice": 1805, "announ": 1806, "power": 1807, "sar": 1808, "aking": 1809, "mini": 1810, "sli": 1811, "swee": 1812, "kar": 1813, "ful": 1814, "cru": 1815, "action": 1816, "ather": 1817, ").": 1818, "stand": 1819, "devel": 1820, "aa": 1821, "gan": 1822, "left": 1823, "lol": 1824, "rel": 1825, "trans": 1826, "ments": 1827, "int": 1828, "ef": 1829, "manag": 1830, "dig": 1831, "gener": 1832, "down": 1833, "pau": 1834, "tiv": 1835, "ku": 1836, "thur": 1837, "ken": 1838, "ston": 1839, "fans": 1840, "talk": 1841, "tweet": 1842, "too": 1843, "style": 1844, "prote": 1845, "secon": 1846, "fron": 1847, "awesome": 1848, "gl": 1849, "pal": 1850, "net": 1851, "sor": 1852, "lau": 1853, "gon": 1854, "since": 1855, "tty": 1856, "series": 1857, "memor": 1858, "beli": 1859, "film": 1860, "did": 1861, "dies": 1862, "ot": 1863, "congratulations": 1864, "pra": 1865, "eve": 1866, "woo": 1867, "official": 1868, "suc": 1869, "incre": 1870, "bon": 1871, "part": 1872, "pped": 1873, "class": 1874, "sive": 1875, "boy": 1876, "cul": 1877, "perfect": 1878, "tou": 1879, "dam": 1880, "welcome": 1881, "football": 1882, "hi": 1883, "pap": 1884, "wait": 1885, "ada": 1886, "congrats": 1887, "young": 1888, "excited": 1889, "rece": 1890, "jan": 1891, "va": 1892, "red": 1893, "stra": 1894, "media": 1895, "'d": 1896, "does": 1897, "let": 1898, "mul": 1899, "ills": 1900, "green": 1901, "mel": 1902, "toge": 1903, "future": 1904, "yester": 1905, "versity": 1906, "form": 1907, "tain": 1908, "ide": 1909, "ches": 1910, "kids": 1911, "qui": 1912, "haha": 1913, "deta": 
1914, "big": 1915, "favorite": 1916, "girls": 1917, "contin": 1918, "dom": 1919, "search": 1920, "ual": 1921, "air": 1922, "ders": 1923, "month": 1924, "cer": 1925, "yesterday": 1926, "community": 1927, "ade": 1928, "dog": 1929, "ville": 1930, "ices": 1931, "deli": 1932, "syste": 1933, "run": 1934, "ism": 1935, "heart": 1936, "cup": 1937, "enti": 1938, "few": 1939, "president": 1940, "eds": 1941, "until": 1942, "festi": 1943, "ok": 1944, "flo": 1945, "said": 1946, "ole": 1947, "med": 1948, "travel": 1949, "£": 1950, "phone": 1951, "together": 1952, "fast": 1953, "lot": 1954, "games": 1955, "shir": 1956, "between": 1957, "yes": 1958, "thers": 1959, "doing": 1960, "mac": 1961, "ator": 1962, "band": 1963, "follow": 1964, "project": 1965, "develop": 1966, "diffe": 1967, "confe": 1968, "speci": 1969, "cast": 1970, "ys": 1971, "board": 1972, "rd": 1973, "ial": 1974, "shoo": 1975, "ram": 1976, "having": 1977, "share": 1978, "follow": 1979, "one": 1980, "name": 1981, "mr": 1982, "put": 1983, "discu": 1984, "ory": 1985, "came": 1986, "ous": 1987, "site": 1988, "twitter": 1989, "tb": 1990, "tit": 1991, "finally": 1992, "zed": 1993, "super": 1994, "compan": 1995, "using": 1996, "alls": 1997, "list": 1998, "ris": 1999, "shot": 2000, "gal": 2001, "tar": 2002, "del": 2003, "john": 2004, "âĢĶ": 2005, "something": 2006, "ram": 2007, "intere": 2008, "whe": 2009, "bit": 2010, "ðŁį": 2011, "street": 2012, "ound": 2013, "ai": 2014, "tickets": 2015, "movie": 2016, "real": 2017, "ky": 2018, "taking": 2019, "opp": 2020, "cc": 2021, "lam": 2022, "moun": 2023, "inve": 2024, "black": 2025, "used": 2026, "online": 2027, "yor": 2028, "local": 2029, "gue": 2030, "cks": 2031, "ow": 2032, "gest": 2033, "boys": 2034, "illion": 2035, "cont": 2036, "reci": 2037, "ined": 2038, "euro": 2039, "now": 2040, "seen": 2041, "ph": 2042, "teach": 2043, "def": 2044, "south": 2045, "such": 2046, "award": 2047, "must": 2048, "issu": 2049, "care": 2050, "feel": 2051, "plu": 2052, "latest": 2053, "sports": 2054, "web": 2055, "tex": 2056, "ement": 2057, "sk": 2058, "fic": 2059, "wan": 2060, "tech": 2061, "ot": 2062, "box": 2063, "ner": 2064, "free": 2065, "tal": 2066, "ash": 2067, "case": 2068, "hot": 2069, "wonder": 2070, "meeting": 2071, "era": 2072, "chall": 2073, "ðŁIJ": 2074, "job": 2075, "ili": 2076, "cool": 2077, "jour": 2078, "ths": 2079, "mo": 2080, "fel": 2081, "die": 2082, "micha": 2083, "ele": 2084, "team": 2085, "service": 2086, "stand": 2087, "makes": 2088, "ping": 2089, "early": 2090, "comes": 2091, "ek": 2092, "holi": 2093, "vers": 2094, "ague": 2095, "sau": 2096, "three": 2097, "monday": 2098, "fashi": 2099, "someone": 2100, "thro": 2101, "sea": 2102, "bad": 2103, "suppor": 2104, "turn": 2105, "ury": 2106, "ming": 2107, "photography": 2108, "nic": 2109, "mark": 2110, "pretty": 2111, "ssing": 2112, "watching": 2113, "memb": 2114, "arri": 2115, "county": 2116, "beach": 2117, "fran": 2118, "center": 2119, "police": 2120, "bat": 2121, "public": 2122, "tan": 2123, "press": 2124, "saf": 2125, "sy": 2126, "gets": 2127, "roy": 2128, "ners": 2129, "your": 2130, "buy": 2131, "sters": 2132, "show": 2133, "ased": 2134, "childre": 2135, "afric": 2136, "ines": 2137, "space": 2138, "scri": 2139, "hall": 2140, "pain": 2141, "aring": 2142, "home": 2143, "mur": 2144, "health": 2145, "ched": 2146, "sand": 2147, "recei": 2148, "guy": 2149, "ea": 2150, "american": 2151, "resi": 2152, "children": 2153, "--": 2154, "iri": 2155, "ington": 2156, "country": 2157, "ross": 2158, "len": 2159, "anna": 2160, "books": 2161, "bc": 2162, "ece": 2163, 
"dom": 2164, "lovely": 2165, "kh": 2166, "pet": 2167, "gy": 2168, "gri": 2169, "stage": 2170, "office": 2171, "rock": 2172, "mon": 2173, "bay": 2174, "table": 2175, "sun": 2176, "med": 2177, "thin": 2178, "lor": 2179, "flow": 2180, "(@": 2181, "university": 2182, "store": 2183, "front": 2184, "good": 2185, "za": 2186, "vote": 2187, "north": 2188, "hey": 2189, "anim": 2190, "order": 2191, "mid": 2192, "without": 2193, "ade": 2194, "remember": 2195, "market": 2196, "??": 2197, "mus": 2198, "training": 2199, "educ": 2200, "but": 2201, "cover": 2202, "stan": 2203, "scen": 2204, "bla": 2205, "break": 2206, "lou": 2207, "same": 2208, "gold": 2209, "ain": 2210, "os": 2211, "both": 2212, "lit": 2213, "vern": 2214, "ai": 2215, "albu": 2216, "pa": 2217, "enjoy": 2218, "beg": 2219, "elling": 2220, "thursday": 2221, "info": 2222, "san": 2223, "america": 2224, "hair": 2225, "tel": 2226, "march": 2227, "concer": 2228, "college": 2229, "conference": 2230, "app": 2231, "hour": 2232, "chang": 2233, "âļ": 2234, "sour": 2235, "ols": 2236, "weather": 2237, "war": 2238, "phi": 2239, "festival": 2240, "second": 2241, "cute": 2242, "prac": 2243, "ener": 2244, "stry": 2245, "lea": 2246, "polit": 2247, "sav": 2248, "sen": 2249, "ow": 2250, "mi": 2251, "near": 2252, "ought": 2253, "ze": 2254, "coffe": 2255, "willi": 2256, "dan": 2257, "sey": 2258, "david": 2259, "ese": 2260, "fan": 2261, "deci": 2262, "theat": 2263, "nov": 2264, "ation": 2265, "trac": 2266, "sci": 2267, "review": 2268, "cel": 2269, "em": 2270, "un": 2271, "july": 2272, "orig": 2273, "tion": 2274, "dru": 2275, "former": 2276, "stay": 2277, "after": 2278, "inv": 2279, "took": 2280, "data": 2281, "bal": 2282, "tues": 2283, "dan": 2284, "evening": 2285, "ðŁĺĤðŁĺĤ": 2286, "dol": 2287, "ures": 2288, "provi": 2289, "ts": 2290, "est": 2291, "sign": 2292, "jac": 2293, "uk": 2294, "song": 2295, "yet": 2296, "bow": 2297, "indu": 2298, "jap": 2299, "hoo": 2300, "point": 2301, "anyone": 2302, "zy": 2303, "ist": 2304, "hur": 2305, "ital": 2306, "building": 2307, "woman": 2308, "chur": 2309, "jer": 2310, "perfor": 2311, "coach": 2312, "league": 2313, "cess": 2314, "net": 2315, "imag": 2316, "nation": 2317, "brit": 2318, "que": 2319, "awards": 2320, "ages": 2321, "works": 2322, "ced": 2323, "mance": 2324, "late": 2325, "ign": 2326, "money": 2327, "true": 2328, "ii": 2329, "tell": 2330, "plac": 2331, "pac": 2332, "asy": 2333, "world": 2334, "behin": 2335, "import": 2336, "reading": 2337, "gram": 2338, "giving": 2339, "met": 2340, "hit": 2341, "forward": 2342, "stom": 2343, "present": 2344, "june": 2345, "social": 2346, "noon": 2347, "mart": 2348, "half": 2349, "swe": 2350, "govern": 2351, "ker": 2352, "details": 2353, "lish": 2354, "__": 2355, "acy": 2356, "sia": 2357, "bert": 2358, "fall": 2359, "!!!!": 2360, "),": 2361, "thi": 2362, "diti": 2363, "sport": 2364, "king": 2365, "fit": 2366, "staf": 2367, "cat": 2368, "muse": 2369, "centr": 2370, "yer": 2371, "contro": 2372, "bloo": 2373, "walk": 2374, "actu": 2375, "didn": 2376, "lim": 2377, "learning": 2378, "research": 2379, "wedne": 2380, "auth": 2381, "hours": 2382, "ky": 2383, "far": 2384, "hen": 2385, "....": 2386, "itch": 2387, "ril": 2388, "strong": 2389, "sky": 2390, "questi": 2391, "james": 2392, "ron": 2393, "dg": 2394, "fur": 2395, "cin": 2396, "does": 2397, "appro": 2398, "marke": 2399, "tures": 2400, "fully": 2401, "chat": 2402, "behind": 2403, "tem": 2404, "fini": 2405, "mission": 2406, "batt": 2407, "feel": 2408, "heav": 2409, "everything": 2410, "bar": 2411, "wish": 2412, "premi": 2413, "ima": 2414, 
"experience": 2415, "each": 2416, "report": 2417, "sweet": 2418, "tics": 2419, "spring": 2420, "respon": 2421, "system": 2422, "victor": 2423, "lin": 2424, "saw": 2425, "already": 2426, "ghter": 2427, "fle": 2428, "ãĥ": 2429, "bring": 2430, "album": 2431, "--": 2432, "ells": 2433, "stan": 2434, "tom": 2435, "international": 2436, "went": 2437, "anni": 2438, "match": 2439, "pper": 2440, "stone": 2441, "small": 2442, "rain": 2443, "fashion": 2444, "area": 2445, "van": 2446, "agram": 2447, "ko": 2448, "thought": 2449, "worth": 2450, "van": 2451, "mer": 2452, "coffee": 2453, "ites": 2454, "gn": 2455, "artist": 2456, "con": 2457, "arch": 2458, "cir": 2459, "secre": 2460, "ground": 2461, "iso": 2462, "hand": 2463, "com": 2464, "bridge": 2465, "hs": 2466, "xi": 2467, "link": 2468, "pul": 2469, "spl": 2470, "race": 2471, "fli": 2472, "river": 2473, "gas": 2474, "disco": 2475, "dal": 2476, "player": 2477, "fit": 2478, "photos": 2479, "ity": 2480, "ok": 2481, "jor": 2482, "tra": 2483, "april": 2484, "ads": 2485, "adi": 2486, "solu": 2487, "beauty": 2488, "door": 2489, "mess": 2490, "update": 2491, "alia": 2492, "scho": 2493, "ened": 2494, "moment": 2495, "scot": 2496, "science": 2497, "ior": 2498, "ties": 2499, "across": 2500, "ously": 2501, "shes": 2502, "doesn": 2503, "page": 2504, "water": 2505, "million": 2506, "classi": 2507, "lic": 2508, "cast": 2509, "formation": 2510, "michael": 2511, "ello": 2512, "smo": 2513, "ints": 2514, "vision": 2515, "opening": 2516, "ldn": 2517, "austr": 2518, "tuesday": 2519, "winner": 2520, "possi": 2521, "round": 2522, "shirt": 2523, "dit": 2524, "bo": 2525, "ues": 2526, "illed": 2527, "along": 2528, "trip": 2529, "starting": 2530, "impro": 2531, "kan": 2532, "person": 2533, "not": 2534, "reco": 2535, "needs": 2536, "cle": 2537, "lie": 2538, "rest": 2539, "ring": 2540, "winter": 2541, "simp": 2542, "mom": 2543, "beer": 2544, "face": 2545, "tors": 2546, "usa": 2547, "collection": 2548, "geor": 2549, "session": 2550, "trying": 2551, "las": 2552, "lake": 2553, "jen": 2554, "origin": 2555, "student": 2556, "secur": 2557, "vin": 2558, "pics": 2559, "expe": 2560, "comp": 2561, "gonna": 2562, "equ": 2563, "bad": 2564, "ley": 2565, "au": 2566, "members": 2567, "break": 2568, "wall": 2569, "gic": 2570, "dinner": 2571, "bul": 2572, "inspir": 2573, "ri": 2574, "mind": 2575, "ica": 2576, "winning": 2577, "talking": 2578, "tren": 2579, "sis": 2580, "ten": 2581, "wonderful": 2582, "snow": 2583, "hear": 2584, "thom": 2585, "nothing": 2586, "gui": 2587, "stin": 2588, "blog": 2589, "fest": 2590, "bun": 2591, "lee": 2592, "wards": 2593, "chance": 2594, "dress": 2595, "ren": 2596, "paul": 2597, "pes": 2598, "techno": 2599, "russi": 2600, "card": 2601, "east": 2602, "mari": 2603, "wine": 2604, "ti": 2605, "law": 2606, "stric": 2607, "ki": 2608, "ape": 2609, "augu": 2610, "profe": 2611, "ash": 2612, "course": 2613, "mail": 2614, "rently": 2615, "dun": 2616, "mun": 2617, "love": 2618, "island": 2619, "drive": 2620, "sl": 2621, "ended": 2622, "main": 2623, "lost": 2624, "nature": 2625, "âĿ¤ï¸ı": 2626, "chic": 2627, "repor": 2628, "pin": 2629, "pro": 2630, "station": 2631, "cep": 2632, "takes": 2633, "company": 2634, "goes": 2635, "ond": 2636, "mach": 2637, "radio": 2638, "dad": 2639, "rock": 2640, "ja": 2641, "pay": 2642, "champion": 2643, "ee": 2644, "inde": 2645, "tta": 2646, "atic": 2647, "tab": 2648, "believe": 2649, "energy": 2650, "zi": 2651, "tat": 2652, "word": 2653, "once": 2654, "resul": 2655, "yl": 2656, "andre": 2657, "ano": 2658, "instagram": 2659, "close": 2660, "tam": 
2661, "custom": 2662, "wa": 2663, "conom": 2664, "shows": 2665, "life": 2666, "kin": 2667, "rob": 2668, "tage": 2669, "nation": 2670, "almost": 2671, "listen": 2672, "save": 2673, "reli": 2674, "ace": 2675, "mary": 2676, "tree": 2677, "forget": 2678, "jack": 2679, "waiting": 2680, "director": 2681, "hill": 2682, "born": 2683, "temp": 2684, "fl": 2685, "ste": 2686, "ona": 2687, "single": 2688, "wednesday": 2689, "united": 2690, "ino": 2691, "@_": 2692, "nel": 2693, "celebrate": 2694, "ending": 2695, "deal": 2696, "ji": 2697, "canada": 2698, "huge": 2699, "track": 2700, "âĢ¢": 2701, "fy": 2702, "fanta": 2703, "ang": 2704, "york": 2705, "release": 2706, "pun": 2707, "episo": 2708, "words": 2709, "tour": 2710, "pack": 2711, "igh": 2712, "classic": 2713, "performance": 2714, "ket": 2715, "afternoon": 2716, "record": 2717, "wins": 2718, "proble": 2719, "âĿ¤": 2720, "four": 2721, "bed": 2722, "bank": 2723, "dance": 2724, "sla": 2725, "called": 2726, "might": 2727, "ap": 2728, "past": 2729, "ðŁļ": 2730, "different": 2731, "ite": 2732, "gift": 2733, "ssive": 2734, "church": 2735, "cus": 2736, "program": 2737, "hotel": 2738, "ice": 2739, "mad": 2740, "security": 2741, "enge": 2742, "dc": 2743, "enough": 2744, "sta": 2745, "ety": 2746, "dead": 2747, "gun": 2748, "hear": 2749, "mir": 2750, "human": 2751, "gress": 2752, "ounds": 2753, "piece": 2754, "breaking": 2755, "garden": 2756, "fight": 2757, "views": 2758, "fish": 2759, "started": 2760, "running": 2761, "green": 2762, "seri": 2763, "sm": 2764, "ask": 2765, "dor": 2766, "death": 2767, "econom": 2768, "eri": 2769, "ird": 2770, "ser": 2771, "lunch": 2772, "âģ¦": 2773, "box": 2774, "natu": 2775, "base": 2776, "ban": 2777, "fal": 2778, "global": 2779, "wild": 2780, "wow": 2781, "outside": 2782, "move": 2783, "lead": 2784, "anal": 2785, "museum": 2786, "ong": 2787, "haw": 2788, "power": 2789, "thank": 2790, "bac": 2791, "charac": 2792, "campa": 2793, "digital": 2794, "ro": 2795, "oper": 2796, "dev": 2797, "wol": 2798, "pati": 2799, "fa": 2800, "male": 2801, "paper": 2802, "illing": 2803, "cs": 2804, "âĥ": 2805, "education": 2806, "taken": 2807, "effe": 2808, "mou": 2809, "sad": 2810, "\".": 2811, "based": 2812, "staff": 2813, "including": 2814, "living": 2815, "ac": 2816, "china": 2817, "mob": 2818, "storm": 2819, "luck": 2820, "phil": 2821, "oo": 2822, "yn": 2823, "travel": 2824, "kel": 2825, "tial": 2826, "price": 2827, "book": 2828, "important": 2829, "bio": 2830, "pool": 2831, "nyc": 2832, "fab": 2833, "load": 2834, "?!": 2835, "challenge": 2836, "cry": 2837, "serve": 2838, "wear": 2839, "bus": 2840, "tain": 2841, "number": 2842, "ror": 2843, "kat": 2844, "iz": 2845, "though": 2846, "hosp": 2847, "mm": 2848, "fair": 2849, "utes": 2850, "hot": 2851, "pop": 2852, "fied": 2853, "camp": 2854, "development": 2855, "libr": 2856, "cali": 2857, "ems": 2858, "âģ¦@": 2859, "bol": 2860, "ised": 2861, "standing": 2862, "model": 2863, "ita": 2864, "gle": 2865, "brown": 2866, "image": 2867, "vered": 2868, "force": 2869, "oil": 2870, "partic": 2871, "shu": 2872, "daily": 2873, "law": 2874, "sec": 2875, "class": 2876, "camp": 2877, "holiday": 2878, "clin": 2879, "kers": 2880, "present": 2881, "game": 2882, "incredi": 2883, "ership": 2884, "interview": 2885, "bill": 2886, "due": 2887, "andy": 2888, "abo": 2889, "innov": 2890, "key": 2891, "acade": 2892, "pil": 2893, "moder": 2894, "stars": 2895, "brand": 2896, "fer": 2897, "weeks": 2898, "consi": 2899, "pre": 2900, "safe": 2901, "writ": 2902, "dium": 2903, "launch": 2904, "marketing": 2905, "annual": 2906, "assi": 
2907, "court": 2908, "lady": 2909, "cted": 2910, "anda": 2911, "inside": 2912, "child": 2913, "oppor": 2914, "smith": 2915, "centre": 2916, "gue": 2917, "âģ©": 2918, "fren": 2919, "sty": 2920, "fort": 2921, "ently": 2922, "isn": 2923, "keep": 2924, "tober": 2925, "ony": 2926, "boy": 2927, "ald": 2928, "colla": 2929, "demo": 2930, "level": 2931, "compet": 2932, "ado": 2933, "bour": 2934, "fantastic": 2935, "mate": 2936, "su": 2937, "south": 2938, "opportun": 2939, "versary": 2940, "later": 2941, "bud": 2942, "facebook": 2943, "laun": 2944, "stern": 2945, "pit": 2946, "!\"": 2947, "maj": 2948, "gram": 2949, "tbt": 2950, "fire": 2951, "happy": 2952, "aks": 2953, "whole": 2954, "actually": 2955, "iller": 2956, "ella": 2957, "lots": 2958, "alex": 2959, "ange": 2960, "lands": 2961, "ðŁĺŃ": 2962, "enter": 2963, "rou": 2964, "episode": 2965, "ped": 2966, "inten": 2967, "shire": 2968, "who": 2969, "plan": 2970, "ho": 2971, "cake": 2972, "west": 2973, "magaz": 2974, "fresh": 2975, "cc": 2976, "nar": 2977, "chris": 2978, "writing": 2979, "wer": 2980, "nom": 2981, "lo": 2982, "midd": 2983, "dream": 2984, "ol": 2985, "tional": 2986, "deb": 2987, ">>": 2988, "become": 2989, "si": 2990, "grand": 2991, "alling": 2992, "histor": 2993, "ride": 2994, "ired": 2995, "safe": 2996, "queen": 2997, "cil": 2998, "intro": 2999, "vil": 3000, "dani": 3001, "...": 3002, "artic": 3003, "stat": 3004, "short": 3005, "oring": 3006, "selfi": 3007, "missi": 3008, "doc": 3009, "bit": 3010, "gall": 3011, "bom": 3012, "ire": 3013, "selec": 3014, "dition": 3015, "ðŁĶ¥": 3016, "friend": 3017, "beat": 3018, "ghting": 3019, "ðŁĺĬ": 3020, "peace": 3021, "exhi": 3022, "anta": 3023, "ability": 3024, "illu": 3025, "jon": 3026, "quality": 3027, "tribu": 3028, "mes": 3029, "players": 3030, "fair": 3031, "cut": 3032, "cab": 3033, "success": 3034, "bi": 3035, "sus": 3036, "promo": 3037, "sche": 3038, "ange": 3039, "ico": 3040, "commit": 3041, "catch": 3042, "illa": 3043, "kind": 3044, "feeling": 3045, "quo": 3046, "say": 3047, "anniversary": 3048, "spot": 3049, "mother": 3050, "ane": 3051, "pend": 3052, "yourself": 3053, "ops": 3054, "apple": 3055, "minutes": 3056, "po": 3057, "grand": 3058, "ries": 3059, "haha": 3060, "career": 3061, "edition": 3062, "dec": 3063, "rick": 3064, "ami": 3065, "concert": 3066, "itive": 3067, "geous": 3068, "dly": 3069, "tte": 3070, "advent": 3071, "ig": 3072, "lights": 3073, "aker": 3074, "sky": 3075, "âĥ£": 3076, "ray": 3077, "finished": 3078, "way": 3079, "sd": 3080, "accoun": 3081, "ðŁĴķ": 3082, "cky": 3083, "chel": 3084, "liter": 3085, "painting": 3086, "los": 3087, "stun": 3088, "technology": 3089, "nas": 3090, "mar": 3091, "bil": 3092, "africa": 3093, "kie": 3094, "eyes": 3095, "golf": 3096, "plus": 3097, "nia": 3098, "itec": 3099, "services": 3100, "wedding": 3101, "known": 3102, "tele": 3103, ".....": 3104, "starts": 3105, "paren": 3106, "wants": 3107, "ational": 3108, "months": 3109, "windo": 3110, "favour": 3111, "ert": 3112, "magazine": 3113, "exclu": 3114, "reve": 3115, "bc": 3116, "original": 3117, "ess": 3118, "nal": 3119, "anti": 3120, "stro": 3121, "tice": 3122, "study": 3123, "à¤": 3124, "vac": 3125, "national": 3126, "five": 3127, "rain": 3128, "vement": 3129, "ute": 3130, "verse": 3131, "emer": 3132, "army": 3133, "possible": 3134, "guess": 3135, "valley": 3136, "thern": 3137, "crow": 3138, "mr": 3139, "color": 3140, "onto": 3141, "pick": 3142, "clear": 3143, "dark": 3144, "tac": 3145, "wanted": 3146, "itting": 3147, "cancer": 3148, "government": 3149, "die": 3150, "rise": 3151, "zing": 
3152, "cold": 3153, "foun": 3154, "studio": 3155, "stration": 3156, "brother": 3157, "ahead": 3158, "shel": 3159, "micro": 3160, "ically": 3161, "dau": 3162, "signed": 3163, "viol": 3164, "ax": 3165, "asse": 3166, "io": 3167, "wre": 3168, "splay": 3169, "chick": 3170, "august": 3171, "plat": 3172, "tips": 3173, "spi": 3174, "human": 3175, "easy": 3176, "logi": 3177, "mike": 3178, "grow": 3179, "agre": 3180, "ww": 3181, "shad": 3182, "motiv": 3183, "wide": 3184, "turns": 3185, "omg": 3186, "var": 3187, "defin": 3188, "sug": 3189, "jim": 3190, "ðŁĶ¥": 3191, "td": 3192, "campaign": 3193, "named": 3194, "retweet": 3195, "cop": 3196, "tv": 3197, "leav": 3198, "kis": 3199, "double": 3200, "smar": 3201, "issue": 3202, "villa": 3203, "information": 3204, "lies": 3205, "stock": 3206, "nt": 3207, "distric": 3208, "shor": 3209, "mix": 3210, "ero": 3211, "sep": 3212, "mex": 3213, "seeing": 3214, "live": 3215, "remin": 3216, "code": 3217, "gur": 3218, "sc": 3219, "wild": 3220, "lun": 3221, "hood": 3222, "spot": 3223, "father": 3224, "forever": 3225, "upd": 3226, "traf": 3227, "fly": 3228, "need": 3229, "gradu": 3230, "train": 3231, "make": 3232, "sab": 3233, "bey": 3234, "size": 3235, "leader": 3236, "talks": 3237, "eu": 3238, "log": 3239, "fox": 3240, "gorgeous": 3241, "less": 3242, "lets": 3243, "surpri": 3244, "myself": 3245, "note": 3246, "lives": 3247, "fru": 3248, "loved": 3249, "sever": 3250, "dem": 3251, "ji": 3252, "soc": 3253, "hold": 3254, "dogs": 3255, "ni": 3256, "âŀ": 3257, "leave": 3258, "airport": 3259, "benef": 3260, "expl": 3261, "ships": 3262, "complete": 3263, "achi": 3264, "great": 3265, "vintage": 3266, "jack": 3267, "roc": 3268, "wood": 3269, "priv": 3270, "offer": 3271, "eye": 3272, "version": 3273, "tea": 3274, "coach": 3275, "offic": 3276, "well": 3277, "gen": 3278, "sat": 3279, "hh": 3280, "youth": 3281, "ox": 3282, "?\"": 3283, "mt": 3284, "mix": 3285, "gg": 3286, "dle": 3287, "natural": 3288, "build": 3289, "breakfast": 3290, "thinking": 3291, "theatre": 3292, "moon": 3293, "berg": 3294, "goals": 3295, "george": 3296, "ene": 3297, "excell": 3298, "iling": 3299, "tune": 3300, "yed": 3301, "gate": 3302, "mit": 3303, "network": 3304, "joe": 3305, "hello": 3306, "fb": 3307, "tube": 3308, "wearing": 3309, "athle": 3310, "struc": 3311, "hard": 3312, "glass": 3313, "gers": 3314, "throw": 3315, "ges": 3316, "bt": 3317, "industry": 3318, "management": 3319, "alist": 3320, "goal": 3321, "stream": 3322, "yel": 3323, "avi": 3324, "icious": 3325, "others": 3326, "ski": 3327, "christi": 3328, "bird": 3329, "esc": 3330, "min": 3331, "tro": 3332, "lt": 3333, "jan": 3334, "imp": 3335, "rights": 3336, "sha": 3337, "organ": 3338, "central": 3339, "ara": 3340, "roll": 3341, "favourite": 3342, "chester": 3343, "else": 3344, "pay": 3345, "cars": 3346, "mine": 3347, "step": 3348, "practice": 3349, "major": 3350, "hang": 3351, "ðŁĺĺ": 3352, "non": 3353, "vari": 3354, "engine": 3355, "volun": 3356, "dia": 3357, "iled": 3358, "architec": 3359, "pink": 3360, "ds": 3361, "thy": 3362, "wash": 3363, "website": 3364, "bag": 3365, "control": 3366, "elli": 3367, "fra": 3368, "answ": 3369, "dence": 3370, "yu": 3371, "ron": 3372, "ola": 3373, "gin": 3374, "drin": 3375, "lic": 3376, "couple": 3377, "spar": 3378, "gon": 3379, "create": 3380, "ct": 3381, "celebrating": 3382, "deep": 3383, "eat": 3384, "tee": 3385, "voice": 3386, "drop": 3387, "visit": 3388, "ators": 3389, "stadium": 3390, "ft": 3391, "wis": 3392, "rol": 3393, "grade": 3394, "famil": 3395, "points": 3396, "repre": 3397, "was": 3398, "traffic": 
3399, "japan": 3400, "org": 3401, "honor": 3402, "texas": 3403, "manu": 3404, "âĻ¥": 3405, "safety": 3406, "rer": 3407, "bag": 3408, "emplo": 3409, "released": 3410, "regu": 3411, "aka": 3412, "nav": 3413, "role": 3414, "senior": 3415, "spect": 3416, "cross": 3417, "lines": 3418, "best": 3419, "pack": 3420, "sin": 3421, "tie": 3422, "missing": 3423, "sunset": 3424, "liber": 3425, "ising": 3426, "jay": 3427, "ski": 3428, "championship": 3429, "activ": 3430, "ladies": 3431, "played": 3432, "yy": 3433, "publ": 3434, "alo": 3435, "pride": 3436, "sr": 3437, "paki": 3438, "lux": 3439, "survi": 3440, "cked": 3441, "ets": 3442, "chocol": 3443, "australia": 3444, "paris": 3445, "miles": 3446, "hat": 3447, "mental": 3448, "ala": 3449, "mean": 3450, "mobile": 3451, "ena": 3452, "insi": 3453, "found": 3454, "chief": 3455, "tag": 3456, "incredible": 3457, "return": 3458, "é": 3459, "google": 3460, "french": 3461, "crew": 3462, "hallo": 3463, "alian": 3464, "jaz": 3465, "cher": 3466, "silver": 3467, "north": 3468, "english": 3469, "baseball": 3470, "caf": 3471, "limited": 3472, "following": 3473, "appreci": 3474, "earth": 3475, "kir": 3476, "vember": 3477, "wed": 3478, "ption": 3479, "ged": 3480, "october": 3481, "flori": 3482, "cr": 3483, "ency": 3484, "gave": 3485, "lord": 3486, "stuff": 3487, "berry": 3488, "post": 3489, "smile": 3490, "broad": 3491, "state": 3492, "gger": 3493, "means": 3494, "icy": 3495, "gun": 3496, "yo": 3497, "master": 3498, "burg": 3499, "hands": 3500, "nie": 3501, "//": 3502, "union": 3503, "british": 3504, "biggest": 3505, "district": 3506, "aming": 3507, "hil": 3508, "oce": 3509, "person": 3510, "pass": 3511, "envir": 3512, "schools": 3513, "arrived": 3514, "ances": 3515, "inspired": 3516, "expla": 3517, "ben": 3518, "library": 3519, "bott": 3520, "amp": 3521, "steph": 3522, "contact": 3523, "bang": 3524, "ms": 3525, "califor": 3526, "told": 3527, "battle": 3528, "bb": 3529, "chicago": 3530, "⾨": 3531, "strate": 3532, "shi": 3533, "dece": 3534, "-)": 3535, "add": 3536, "lab": 3537, "jones": 3538, "legend": 3539, "castle": 3540, "inger": 3541, "stance": 3542, "bel": 3543, "ura": 3544, "refu": 3545, "leaders": 3546, "pot": 3547, "sex": 3548, "hic": 3549, "article": 3550, "kid": 3551, "france": 3552, "xx": 3553, "exe": 3554, "guide": 3555, "volunte": 3556, "print": 3557, "ali": 3558, "ceo": 3559, "tweets": 3560, "wx": 3561, "scene": 3562, "volu": 3563, "anti": 3564, "han": 3565, "associ": 3566, "sharing": 3567, "rose": 3568, "minister": 3569, "sher": 3570, "inste": 3571, "clean": 3572, "democr": 3573, "poster": 3574, "skin": 3575, "psy": 3576, "proper": 3577, "crazy": 3578, "iam": 3579, "ore": 3580, "ini": 3581, "anything": 3582, "pod": 3583, "moving": 3584, "click": 3585, "explo": 3586, "comb": 3587, "craft": 3588, "fi": 3589, "blood": 3590, "isra": 3591, "public": 3592, "dent": 3593, "olym": 3594, "england": 3595, "asi": 3596, "cher": 3597, "fact": 3598, "environ": 3599, "harry": 3600, "gone": 3601, "medic": 3602, "enjoying": 3603, "justice": 3604, "jr": 3605, "indian": 3606, "wife": 3607, "sound": 3608, "tes": 3609, "drawing": 3610, "pal": 3611, "idea": 3612, "crit": 3613, "juli": 3614, "iler": 3615, "warm": 3616, "clar": 3617, "thoughts": 3618, "defen": 3619, "council": 3620, "introduc": 3621, "died": 3622, "janu": 3623, "ani": 3624, "send": 3625, "lier": 3626, "ml": 3627, "interesting": 3628, "trade": 3629, "wind": 3630, "bay": 3631, "sac": 3632, "ancy": 3633, "source": 3634, "bes": 3635, "organi": 3636, "arly": 3637, "large": 3638, "ffici": 3639, "tag": 3640, "ut": 3641, 
"desp": 3642, "oes": 3643, "title": 3644, "sym": 3645, "pictures": 3646, "open": 3647, "women": 3648, "showing": 3649, "ria": 3650, "least": 3651, "leadership": 3652, "current": 3653, "electr": 3654, "valent": 3655, "listening": 3656, "ckey": 3657, "general": 3658, "deser": 3659, "duce": 3660, ";)": 3661, "cent": 3662, "ðŁĺįðŁĺį": 3663, "scott": 3664, "poor": 3665, "selfie": 3666, "events": 3667, "ion": 3668, "wrong": 3669, "dev": 3670, "hill": 3671, "septe": 3672, "culture": 3673, "line": 3674, "sorry": 3675, "sent": 3676, "sister": 3677, "cept": 3678, "kri": 3679, "november": 3680, "ari": 3681, "announce": 3682, "zation": 3683, "bran": 3684, "gent": 3685, "du": 3686, "len": 3687, "pers": 3688, "fm": 3689, "martin": 3690, "op": 3691, "emb": 3692, "ome": 3693, "middle": 3694, "success": 3695, "peter": 3696, "january": 3697, "flu": 3698, "racing": 3699, "dav": 3700, "bike": 3701, "ðŁı»": 3702, "pet": 3703, "shoot": 3704, "professi": 3705, "featuring": 3706, "september": 3707, "nowplaying": 3708, "staur": 3709, "za": 3710, "onic": 3711, "quick": 3712, "baske": 3713, "speaking": 3714, "milit": 3715, "zer": 3716, "chicken": 3717, "bell": 3718, "sad": 3719, "coast": 3720, "loving": 3721, "yers": 3722, "dj": 3723, "panel": 3724, "verage": 3725, "swit": 3726, "icks": 3727, "bou": 3728, "california": 3729, "sam": 3730, "parents": 3731, "ero": 3732, "killed": 3733, "phys": 3734, "jobs": 3735, "migr": 3736, "anth": 3737, "emo": 3738, "halloween": 3739, "ander": 3740, "cm": 3741, "competition": 3742, "eag": 3743, "sket": 3744, "spir": 3745, "maybe": 3746, "exclusive": 3747, "appe": 3748, "journey": 3749, "screen": 3750, "ford": 3751, "io": 3752, "hate": 3753, "ug": 3754, "soul": 3755, "hero": 3756, "society": 3757, "syn": 3758, "guit": 3759, "nh": 3760, "dj": 3761, "ases": 3762, "impre": 3763, "time": 3764, "sales": 3765, "dd": 3766, "fts": 3767, "summit": 3768, "stunning": 3769, "oms": 3770, "turned": 3771, "clean": 3772, "soft": 3773, "beat": 3774, "restaur": 3775, "dered": 3776, "ences": 3777, "magic": 3778, "dio": 3779, "shine": 3780, "guest": 3781, "healthy": 3782, "exhib": 3783, "stories": 3784, "popu": 3785, "nis": 3786, "ela": 3787, "below": 3788, "funny": 3789, "results": 3790, "sne": 3791, "currently": 3792, "ard": 3793, "download": 3794, "flight": 3795, "mal": 3796, "fine": 3797, "pad": 3798, "chu": 3799, "ented": 3800, "hat": 3801, "ðŁijı": 3802, "steve": 3803, "jo": 3804, "mark": 3805, "rat": 3806, "ball": 3807, "pc": 3808, "pon": 3809, "bby": 3810, "oli": 3811, "arts": 3812, "asure": 3813, "bowl": 3814, "attack": 3815, "mic": 3816, "dear": 3817, "range": 3818, "enter": 3819, "chocolate": 3820, "brilli": 3821, "access": 3822, ",\"": 3823, "???": 3824, "chap": 3825, "const": 3826, "tn": 3827, "matter": 3828, "blue": 3829, "gallery": 3830, "emp": 3831, "workshop": 3832, "leading": 3833, "yours": 3834, "basketball": 3835, "wanna": 3836, "thu": 3837, "__": 3838, "marri": 3839, "sleep": 3840, "bia": 3841, "che": 3842, "mad": 3843, "impact": 3844, "own": 3845, "sir": 3846, "channel": 3847, "europe": 3848, "esp": 3849, "kitch": 3850, "hospital": 3851, "wra": 3852, "royal": 3853, "fs": 3854, "neu": 3855, "quar": 3856, "ney": 3857, "acks": 3858, "chase": 3859, "ppy": 3860, "stal": 3861, "ately": 3862, "tim": 3863, "december": 3864, "rare": 3865, "perform": 3866, "cream": 3867, "weight": 3868, "choo": 3869, "night": 3870, "haven": 3871, "franc": 3872, "khan": 3873, "built": 3874, "helping": 3875, "trust": 3876, "type": 3877, "golden": 3878, "tax": 3879, "snow": 3880, "swi": 3881, "disa": 3882, 
"questions": 3883, "vey": 3884, "light": 3885, "cn": 3886, "cloud": 3887, "thomas": 3888, "aged": 3889, "shou": 3890, "teams": 3891, "gran": 3892, "reason": 3893, "aa": 3894, "youtube": 3895, "vp": 3896, "pizz": 3897, "manager": 3898, "bury": 3899, "credit": 3900, "treat": 3901, "max": 3902, "ik": 3903, "main": 3904, "ging": 3905, "dead": 3906, "probab": 3907, "yeah": 3908, "ãĤ": 3909, "brand": 3910, "soli": 3911, "plant": 3912, "tayl": 3913, "girl": 3914, "ðŁĺŃ": 3915, "nament": 3916, "auto": 3917, "message": 3918, "kore": 3919, "nur": 3920, "terr": 3921, "agu": 3922, "map": 3923, "senting": 3924, "loves": 3925, "gives": 3926, "gab": 3927, "zen": 3928, "robert": 3929, "confir": 3930, "wars": 3931, "om": 3932, "stain": 3933, "camera": 3934, "ander": 3935, "wonder": 3936, "ab": 3937, "cap": 3938, "sold": 3939, "suit": 3940, "walking": 3941, "continue": 3942, "effec": 3943, "daughter": 3944, "danc": 3945, "chain": 3946, "multi": 3947, "kid": 3948, "yan": 3949, "champion": 3950, "vo": 3951, "tains": 3952, "host": 3953, "mini": 3954, "missed": 3955, "resc": 3956, "lyn": 3957, "finish": 3958, "delicious": 3959, "sas": 3960, "taylor": 3961, "ib": 3962, "promis": 3963, "products": 3964, "mountain": 3965, "florida": 3966, "register": 3967, "treat": 3968, "recent": 3969, "female": 3970, "booth": 3971, "matt": 3972, "vehic": 3973, "sop": 3974, "motor": 3975, "supporting": 3976, "phic": 3977, "extre": 3978, "drink": 3979, "lane": 3980, "third": 3981, "ps": 3982, "constru": 3983, "cere": 3984, "farm": 3985, "ðŁİī": 3986, "tured": 3987, "ðŁijī": 3988, "cats": 3989, "aj": 3990, "gie": 3991, "shooting": 3992, "asked": 3993, "pakistan": 3994, "ame": 3995, "mb": 3996, "gil": 3997, "legal": 3998, "square": 3999, "invol": 4000, "draw": 4001, "oooo": 4002, "!!!!": 4003, "opportunity": 4004, "py": 4005, "ei": 4006, "bts": 4007, "teacher": 4008, "character": 4009, "johnson": 4010, "bron": 4011, "lywood": 4012, "chine": 4013, "cing": 4014, "cine": 4015, "dge": 4016, "gaming": 4017, "russia": 4018, "cia": 4019, "quote": 4020, "rich": 4021, "gov": 4022, "flowers": 4023, "spiri": 4024, "stin": 4025, "growth": 4026, "ðŁı¼": 4027, "commer": 4028, "juni": 4029, "mum": 4030, "ran": 4031, "sna": 4032, "aren": 4033, "cb": 4034, "actor": 4035, "color": 4036, "sit": 4037, "pair": 4038, "chi": 4039, "bow": 4040, "academy": 4041, "held": 4042, "rang": 4043, "metal": 4044, "yl": 4045, "active": 4046, "probably": 4047, "tch": 4048, "needed": 4049, "spee": 4050, "choice": 4051, "italy": 4052, "ryan": 4053, "ðŁĩº": 4054, "flower": 4055, "vit": 4056, "mn": 4057, "foundation": 4058, "bak": 4059, "sions": 4060, "neigh": 4061, "floo": 4062, "heard": 4063, "remo": 4064, "fresh": 4065, "inging": 4066, "ref": 4067, "town": 4068, "clou": 4069, "jesus": 4070, "spirit": 4071, "couldn": 4072, "zes": 4073, "ðŁĴĻ": 4074, "williams": 4075, "proce": 4076, "modern": 4077, "process": 4078, "shoes": 4079, "created": 4080, "tric": 4081, "issues": 4082, "anne": 4083, "atten": 4084, "debut": 4085, "hr": 4086, "nit": 4087, "stig": 4088, "apo": 4089, "eps": 4090, "zu": 4091, "ãĢ": 4092, "six": 4093, "cards": 4094, "langu": 4095, "famous": 4096, "tournament": 4097, "sel": 4098, "ebay": 4099, "yn": 4100, "ston": 4101, "kick": 4102, "announced": 4103, "kam": 4104, "voc": 4105, "brilliant": 4106, "house": 4107, "cheese": 4108, "warri": 4109, "music": 4110, "hockey": 4111, "ðŁĺĤðŁĺĤ": 4112, "skills": 4113, "autom": 4114, "smart": 4115, "medical": 4116, "mony": 4117, "ex": 4118, "guar": 4119, "give": 4120, "personal": 4121, "vention": 4122, "alli": 4123, 
"press": 4124, "floor": 4125, "mc": 4126, "victory": 4127, "him": 4128, "simple": 4129, "thor": 4130, "ðŁĩºðŁĩ": 4131, "tail": 4132, "lucky": 4133, "alex": 4134, "quite": 4135, "bot": 4136, "ssions": 4137, "challeng": 4138, "cann": 4139, "amazon": 4140, "hell": 4141, "bought": 4142, "):": 4143, "edy": 4144, "secret": 4145, "production": 4146, "independ": 4147, "defe": 4148, "added": 4149, "pr": 4150, "pag": 4151, "bed": 4152, "greatest": 4153, "within": 4154, "jay": 4155, "ðŁ¥": 4156, "ireland": 4157, "rely": 4158, "sd": 4159, "text": 4160, "driving": 4161, "program": 4162, "speed": 4163, "colum": 4164, "stron": 4165, "é": 4166, "forest": 4167, "âĸ": 4168, "machine": 4169, "coin": 4170, "scar": 4171, "ount": 4172, "bie": 4173, "¡ï¸ı": 4174, "portra": 4175, "common": 4176, "wrest": 4177, "received": 4178, "know": 4179, "invest": 4180, "plans": 4181, "accor": 4182, "adop": 4183, "tery": 4184, "reali": 4185, "pp": 4186, "kal": 4187, "artwork": 4188, "mean": 4189, "god": 4190, "instead": 4191, "anci": 4192, "motivation": 4193, "asing": 4194, "inspiration": 4195, "upcoming": 4196, "political": 4197, "europe": 4198, "mers": 4199, "heavy": 4200, "ðŁijį": 4201, "febru": 4202, "scotland": 4203, "ough": 4204, "bt": 4205, "boss": 4206, "schedu": 4207, "speak": 4208, "nick": 4209, "ured": 4210, "ino": 4211, "ek": 4212, "risk": 4213, "tory": 4214, "presents": 4215, "bon": 4216, "rug": 4217, "states": 4218, "exhibition": 4219, "ilo": 4220, "mill": 4221, "brought": 4222, ":-)": 4223, "touri": 4224, "come": 4225, "officially": 4226, "champions": 4227, "doors": 4228, "rep": 4229, "pose": 4230, "extra": 4231, "kings": 4232, "soccer": 4233, "squad": 4234, "applic": 4235, "ata": 4236, "sometimes": 4237, "tari": 4238, "excellent": 4239, "ðŁĺĺ": 4240, "straight": 4241, "carol": 4242, "rip": 4243, "âĢį": 4244, "graphic": 4245, "mol": 4246, "election": 4247, "february": 4248, "asons": 4249, "li": 4250, "dir": 4251, "mt": 4252, "nick": 4253, "usu": 4254, "mrs": 4255, "comics": 4256, "institu": 4257, "corpor": 4258, "vi": 4259, "ðŁĻı": 4260, "tural": 4261, "dise": 4262, "acci": 4263, "weare": 4264, "among": 4265, "shopping": 4266, "till": 4267, "what": 4268, "chair": 4269, "span": 4270, "chinese": 4271, "innovation": 4272, "joy": 4273, "kit": 4274, "century": 4275, "obama": 4276, "phili": 4277, "fc": 4278, "reach": 4279, "citi": 4280, "ulous": 4281, "non": 4282, "dang": 4283, "happening": 4284, "burn": 4285, "pel": 4286, "orange": 4287, "dv": 4288, "kick": 4289, "claim": 4290, "ingham": 4291, "phy": 4292, "nov": 4293, "podcast": 4294, "whi": 4295, "nights": 4296, "earlier": 4297, "bear": 4298, "lah": 4299, "exciting": 4300, "ora": 4301, "given": 4302, "slo": 4303, "memories": 4304, "continues": 4305, "product": 4306, "gho": 4307, "cd": 4308, "knows": 4309, "ðŁİī": 4310, "published": 4311, "discuss": 4312, "yard": 4313, "iphone": 4314, "tries": 4315, "wall": 4316, "feb": 4317, "aren": 4318, "truth": 4319, "winners": 4320, "ture": 4321, "ditional": 4322, "military": 4323, "problem": 4324, "mand": 4325, "dog": 4326, "loss": 4327, "cric": 4328, "canadi": 4329, "veter": 4330, "village": 4331, "\",": 4332, "yr": 4333, "ung": 4334, "donald": 4335, "aging": 4336, "birds": 4337, "scienti": 4338, "les": 4339, "this": 4340, "region": 4341, "tical": 4342, "itten": 4343, "ila": 4344, "ðŁĺİ": 4345, "dad": 4346, "diam": 4347, "above": 4348, "stren": 4349, "lit": 4350, "pir": 4351, "lab": 4352, "focus": 4353, "busy": 4354, "dur": 4355, "apply": 4356, "sma": 4357, "author": 4358, "aci": 4359, "execu": 4360, "domin": 4361, "rela": 
4362, "jackson": 4363, "ato": 4364, "washington": 4365, "ðŁĻĮ": 4366, "kill": 4367, "popular": 4368, "cement": 4369, "road": 4370, "eating": 4371, "location": 4372, "vent": 4373, "arre": 4374, "nan": 4375, "custo": 4376, "adventure": 4377, "ordin": 4378, "sport": 4379, "ult": 4380, "lock": 4381, "question": 4382, "driver": 4383, "landsc": 4384, "oni": 4385, "kins": 4386, "pd": 4387, "jordan": 4388, "tered": 4389, "kk": 4390, "af": 4391, "child": 4392, "sp": 4393, "justin": 4394, "eni": 4395, "selling": 4396, "zo": 4397, "whit": 4398, "boston": 4399, "particip": 4400, "signing": 4401, "happened": 4402, "heat": 4403, "mam": 4404, "dreams": 4405, "lows": 4406, "graph": 4407, "theday": 4408, "heading": 4409, "bro": 4410, "blessed": 4411, "vic": 4412, "vegas": 4413, "hd": 4414, "inning": 4415, "roman": 4416, "andro": 4417, "denti": 4418, "use": 4419, "cit": 4420, "progress": 4421, "writer": 4422, "bob": 4423, "ffs": 4424, "growing": 4425, "bly": 4426, "aware": 4427, "exam": 4428, "spent": 4429, "bet": 4430, "score": 4431, "beyond": 4432, "docu": 4433, "adel": 4434, "sf": 4435, "coura": 4436, "collabor": 4437, "inc": 4438, "private": 4439, "boat": 4440, "**": 4441, "zone": 4442, "pha": 4443, "bill": 4444, "total": 4445, "planning": 4446, "towards": 4447, "places": 4448, "preview": 4449, "creative": 4450, "damn": 4451, "ideas": 4452, "seems": 4453, "poten": 4454, "saying": 4455, "display": 4456, "sw": 4457, "aqu": 4458, "louis": 4459, "bye": 4460, "lil": 4461, "email": 4462, "western": 4463, "germany": 4464, "eller": 4465, "res": 4466, "fant": 4467, "mentary": 4468, "deals": 4469, "richard": 4470, "jersey": 4471, "streng": 4472, "rad": 4473, "pizza": 4474, "mond": 4475, "ware": 4476, "lac": 4477, "gi": 4478, "archi": 4479, "cd": 4480, "yellow": 4481, "recently": 4482, "reach": 4483, "à¹": 4484, "kitchen": 4485, "designed": 4486, "try": 4487, "gal": 4488, "restaurant": 4489, "ature": 4490, "ww": 4491, "jas": 4492, "lma": 4493, "ðŁijĮ": 4494, "pain": 4495, "avo": 4496, "minute": 4497, "schol": 4498, "therap": 4499, "ticket": 4500, "dry": 4501, "japan": 4502, "ditions": 4503, "terri": 4504, "selves": 4505, "happen": 4506, "tup": 4507, "mag": 4508, "copy": 4509, "sher": 4510, "freedom": 4511, "file": 4512, "specially": 4513, "toronto": 4514, "load": 4515, "gary": 4516, "rey": 4517, "answer": 4518, "loy": 4519, "caught": 4520, "prize": 4521, "une": 4522, "fication": 4523, "niger": 4524, "syd": 4525, "touch": 4526, "feature": 4527, "jazz": 4528, "records": 4529, "himself": 4530, "dish": 4531, "rober": 4532, "spotted": 4533, "master": 4534, "wave": 4535, "finals": 4536, "bull": 4537, "forum": 4538, "ald": 4539, "recomm": 4540, "cha": 4541, "ae": 4542, "doo": 4543, "instru": 4544, "truly": 4545, "lg": 4546, "ink": 4547, "brothers": 4548, "dest": 4549, "jim": 4550, "mit": 4551, "closed": 4552, "ison": 4553, "tried": 4554, "santa": 4555, "affe": 4556, "wan": 4557, "horse": 4558, "grow": 4559, "campus": 4560, "relation": 4561, "native": 4562, "journ": 4563, "gov": 4564, "oct": 4565, "kit": 4566, "bound": 4567, "partner": 4568, "rema": 4569, "crowd": 4570, "!)": 4571, "calls": 4572, "rail": 4573, "quali": 4574, "solution": 4575, "contest": 4576, "convers": 4577, "snap": 4578, "base": 4579, "initi": 4580, "tax": 4581, "ye": 4582, "entrepre": 4583, "itor": 4584, "construction": 4585, "food": 4586, "presented": 4587, "nings": 4588, "climate": 4589, "km": 4590, "model": 4591, "bj": 4592, "block": 4593, "presentation": 4594, "dream": 4595, "fix": 4596, "calling": 4597, "busine": 4598, "congress": 4599, 
"understand": 4600, "web": 4601, "value": 4602, "ï¸ıâĥ£": 4603, "mexico": 4604, "itely": 4605, "kim": 4606, "charity": 4607, "reflec": 4608, "blan": 4609, "flying": 4610, "analy": 4611, "families": 4612, "band": 4613, "recipe": 4614, "celebration": 4615, "accep": 4616, "ary": 4617, "tot": 4618, "gb": 4619, "interested": 4620, "captain": 4621, "âĻ¥": 4622, "tip": 4623, "absol": 4624, "braz": 4625, "investig": 4626, "ology": 4627, "dec": 4628, "truck": 4629, "vering": 4630, "clear": 4631, "dont": 4632, "gotta": 4633, "advis": 4634, "begins": 4635, "mass": 4636, "descri": 4637, "block": 4638, "kim": 4639, "david": 4640, "songs": 4641, "memorial": 4642, "features": 4643, "sustain": 4644, "'.": 4645, "grab": 4646, "jose": 4647, "va": 4648, "conserv": 4649, "sets": 4650, "manchester": 4651, "fighting": 4652, "degre": 4653, "aga": 4654, "ind": 4655, "sleep": 4656, "position": 4657, "hair": 4658, "signs": 4659, "policy": 4660, "ito": 4661, "alert": 4662, "stam": 4663, "spend": 4664, "wy": 4665, "absolut": 4666, "dm": 4667, "animal": 4668, "myster": 4669, "successful": 4670, "problems": 4671, "robo": 4672, "kay": 4673, "garden": 4674, "pd": 4675, "mayor": 4676, "dale": 4677, "tol": 4678, "offers": 4679, "visiting": 4680, "friendly": 4681, "trees": 4682, "officer": 4683, "account": 4684, "kevin": 4685, "ðŁijį": 4686, "giant": 4687, "continu": 4688, "consu": 4689, "tract": 4690, "nfl": 4691, "ðŁĺĬ": 4692, "hq": 4693, "bility": 4694, "aar": 4695, "disney": 4696, "teen": 4697, "oned": 4698, "white": 4699, "trailer": 4700, "dedic": 4701, "alone": 4702, "absolutely": 4703, "digital": 4704, "william": 4705, "ination": 4706, "swa": 4707, "ee": 4708, "entire": 4709, "german": 4710, "roll": 4711, "hits": 4712, "cost": 4713, "stay": 4714, "tha": 4715, "alive": 4716, "according": 4717, "cot": 4718, "literally": 4719, "herit": 4720, "reti": 4721, "hahaha": 4722, "experi": 4723, "likes": 4724, "gt": 4725, "steel": 4726, "____": 4727, "chair": 4728, "christian": 4729, "tower": 4730, "difference": 4731, "md": 4732, "tress": 4733, "mid": 4734, "prince": 4735, "african": 4736, "feder": 4737, "foot": 4738, "carri": 4739, "served": 4740, "rice": 4741, "shall": 4742, "featured": 4743, "cker": 4744, "recru": 4745, "poe": 4746, "sense": 4747, "nific": 4748, "comedy": 4749, "content": 4750, "fat": 4751, "posted": 4752, "contribu": 4753, "timate": 4754, "liver": 4755, "mble": 4756, "internet": 4757, "age": 4758, "european": 4759, "cling": 4760, "glad": 4761, "ffic": 4762, "sco": 4763, "akes": 4764, "elle": 4765, "termin": 4766, "tony": 4767, "pale": 4768, "colour": 4769, "serious": 4770, "patri": 4771, "movies": 4772, "bm": 4773, "professional": 4774, "ado": 4775, "alu": 4776, "bringing": 4777, "falls": 4778, "israel": 4779, "term": 4780, "language": 4781, "brook": 4782, "mann": 4783, "communic": 4784, "cannot": 4785, "acti": 4786, "phe": 4787, "yan": 4788, "entreprene": 4789, "turkey": 4790, "logical": 4791, "long": 4792, "arm": 4793, "urs": 4794, "workers": 4795, "ingly": 4796, "ggs": 4797, "ric": 4798, "tual": 4799, "receive": 4800, "opens": 4801, "gear": 4802, "social": 4803, "feet": 4804, "cking": 4805, "adver": 4806, "finan": 4807, "feels": 4808, "spla": 4809, "hr": 4810, "easter": 4811, "brain": 4812, "ãģ": 4813, "fig": 4814, "ledge": 4815, "nearly": 4816, "protect": 4817, "massive": 4818, "eth": 4819, "awa": 4820, "ðŁĺģ": 4821, "yrs": 4822, "awareness": 4823, "definitely": 4824, "kn": 4825, "imagine": 4826, "ku": 4827, "systems": 4828, "ðŁijı": 4829, "fas": 4830, "lik": 4831, "provide": 4832, "amo": 4833, 
"discover": 4834, "influ": 4835, "maker": 4836, "gaz": 4837, "fitness": 4838, "street": 4839, "ers": 4840, "ted": 4841, "wc": 4842, "ysis": 4843, "positive": 4844, "helped": 4845, "quest": 4846, "andrew": 4847, "brad": 4848, "bin": 4849, "hanging": 4850, "ling": 4851, "bright": 4852, "section": 4853, "mass": 4854, "ðŁĻĮ": 4855, "followers": 4856, "hosting": 4857, "tempor": 4858, "flag": 4859, "ave": 4860, "letter": 4861, "kur": 4862, "requi": 4863, "often": 4864, "cryp": 4865, "suff": 4866, "âļ½": 4867, "russian": 4868, "treatment": 4869, "alle": 4870, "hay": 4871, "lan": 4872, "keeping": 4873, "holy": 4874, "powerful": 4875, "predic": 4876, "fund": 4877, "especially": 4878, "window": 4879, "jewel": 4880, "ily": 4881, "ðŁĴľ": 4882, "generation": 4883, "appa": 4884, "seriously": 4885, "od": 4886, "ðŁĺĤðŁĺĤðŁĺĤ": 4887, "certi": 4888, "irish": 4889, "ðŁijĮ": 4890, "miami": 4891, "beth": 4892, "vity": 4893, "secu": 4894, "chef": 4895, "crime": 4896, "graphy": 4897, "max": 4898, "artists": 4899, "revolu": 4900, "guard": 4901, "speech": 4902, "uc": 4903, "updates": 4904, "faces": 4905, "stant": 4906, "changed": 4907, "reports": 4908, "lower": 4909, "pear": 4910, "nc": 4911, "kil": 4912, "looked": 4913, "speaker": 4914, "sf": 4915, "respect": 4916, "okay": 4917, "ocean": 4918, "sitting": 4919, "architecture": 4920, "trail": 4921, "seat": 4922, "ira": 4923, "leg": 4924, "japanese": 4925, "dam": 4926, "ular": 4927, "swim": 4928, "politics": 4929, "financial": 4930, "old": 4931, "mouth": 4932, "attemp": 4933, "destin": 4934, "fishing": 4935, "attention": 4936, "mem": 4937, "changes": 4938, "decided": 4939, "religi": 4940, "gin": 4941, "cav": 4942, "zz": 4943, "adam": 4944, "mac": 4945, "write": 4946, "begin": 4947, "scul": 4948, "alter": 4949, "iss": 4950, "athon": 4951, "images": 4952, "moo": 4953, "joined": 4954, "ðŁĺī": 4955, "âŀ¡ï¸ı": 4956, "passed": 4957, "musli": 4958, "hir": 4959, "largest": 4960, "camer": 4961, "comic": 4962, "ghted": 4963, "rugby": 4964, "burgh": 4965, "gging": 4966, "testing": 4967, "prepar": 4968, "laugh": 4969, "aled": 4970, "improve": 4971, "believ": 4972, "advice": 4973, "shares": 4974, "heart": 4975, "turning": 4976, "sb": 4977, "tel": 4978, "cafe": 4979, "nes": 4980, "daniel": 4981, "patter": 4982, "tz": 4983, "sett": 4984, "park": 4985, "cand": 4986, "stick": 4987, "happens": 4988, "brian": 4989, "newest": 4990, "epic": 4991, "ador": 4992, "kies": 4993, "warning": 4994, "animals": 4995, "custom": 4996, "arc": 4997, "dian": 4998, "gold": 4999, "core": 5000, "tf": 5001, "city": 5002, "pants": 5003, "reality": 5004, "confi": 5005, "inju": 5006, "fox": 5007, "guil": 5008, "knew": 5009, "âĺº": 5010, "correc": 5011, "itude": 5012, "dden": 5013, ".#": 5014, "reduc": 5015, "pass": 5016, "fon": 5017, "ya": 5018, "owner": 5019, "returns": 5020, "nc": 5021, "east": 5022, "apol": 5023, "insur": 5024, "tho": 5025, "sim": 5026, "junior": 5027, "bee": 5028, "angel": 5029, "attle": 5030, "electric": 5031, "horror": 5032, "crash": 5033, "eye": 5034, "path": 5035, "southern": 5036, "employe": 5037, "geo": 5038, "tan": 5039, "haz": 5040, "rally": 5041, "ðŁı»": 5042, "property": 5043, "wasn": 5044, "enjoyed": 5045, "grey": 5046, "gas": 5047, "brew": 5048, "northern": 5049, "holding": 5050, "gp": 5051, "take": 5052, "chart": 5053, "lyn": 5054, "drama": 5055, "zo": 5056, "paid": 5057, "throwback": 5058, "cup": 5059, "discussion": 5060, "downtown": 5061, "will": 5062, "lew": 5063, "bis": 5064, "tary": 5065, "bread": 5066, "upon": 5067, "rate": 5068, "teachers": 5069, "itation": 5070, 
"anced": 5071, "cycle": 5072, "choose": 5073, "dc": 5074, "iran": 5075, "cow": 5076, "dave": 5077, "raise": 5078, "princess": 5079, "faith": 5080, "->": 5081, "industri": 5082, "spain": 5083, "guitar": 5084, "facts": 5085, "mn": 5086, "spen": 5087, "courte": 5088, "gott": 5089, "projects": 5090, "audi": 5091, "osc": 5092, "peter": 5093, "sand": 5094, "interest": 5095, "happiness": 5096, "venue": 5097, "soldi": 5098, "surprise": 5099, "potential": 5100, "perio": 5101, "customer": 5102, "ii": 5103, "gni": 5104, "manufac": 5105, "eco": 5106, "broken": 5107, "singer": 5108, "vels": 5109, "wales": 5110, "hus": 5111, "inj": 5112, "four": 5113, "talent": 5114, "dying": 5115, "matthe": 5116, "film": 5117, "joining": 5118, "sell": 5119, "jar": 5120, "lmao": 5121, "surger": 5122, "bbc": 5123, "sources": 5124, "austin": 5125, "nik": 5126, "charles": 5127, "fam": 5128, "princi": 5129, "angel": 5130, "cash": 5131, "lot": 5132, "ored": 5133, "plays": 5134, "plate": 5135, "done": 5136, "memory": 5137, "brings": 5138, "nba": 5139, "solutions": 5140, "teaching": 5141, "grace": 5142, "circu": 5143, "helps": 5144, "founder": 5145, "mary": 5146, "explore": 5147, "decor": 5148, "parts": 5149, "cho": 5150, "integr": 5151, "hau": 5152, "ises": 5153, "putting": 5154, "iner": 5155, "rit": 5156, "vy": 5157, "michel": 5158, "blues": 5159, "everyday": 5160, "forms": 5161, "bio": 5162, "year": 5163, "pin": 5164, "tter": 5165, "spring": 5166, "))": 5167, "pot": 5168, "aling": 5169, "performing": 5170, "shan": 5171, "planet": 5172, "musical": 5173, "heads": 5174, "italian": 5175, "strugg": 5176, "âĢįâĻ": 5177, "wings": 5178, "pump": 5179, "hh": 5180, "trou": 5181, "aid": 5182, "prime": 5183, "earth": 5184, "paint": 5185, "mont": 5186, "amy": 5187, "bbc": 5188, "fabulous": 5189, "fruit": 5190, "android": 5191, "bourne": 5192, "ceremony": 5193, "ential": 5194, "??": 5195, "debate": 5196, "oning": 5197, "draft": 5198, "solar": 5199, "tx": 5200, "jam": 5201, "corn": 5202, "!!!!!": 5203, "broo": 5204, "milk": 5205, "posed": 5206, "ohi": 5207, "movement": 5208, "bren": 5209, "partner": 5210, "pg": 5211, "ette": 5212, "aries": 5213, "shout": 5214, "ng": 5215, "leaving": 5216, "tells": 5217, "sens": 5218, "taste": 5219, "kelly": 5220, "worl": 5221, "gym": 5222, "rich": 5223, "egy": 5224, "pid": 5225, "mas": 5226, "âĤ": 5227, "courtesy": 5228, "frank": 5229, "increase": 5230, "written": 5231, "ppers": 5232, "rel": 5233, "hai": 5234, "sas": 5235, "sound": 5236, "tti": 5237, "wich": 5238, "river": 5239, "...\"": 5240, "ag": 5241, "fellow": 5242, "rome": 5243, "small": 5244, "gency": 5245, "ican": 5246, "luxury": 5247, "proof": 5248, "met": 5249, "wildlife": 5250, "moments": 5251, "rather": 5252, "corner": 5253, "compe": 5254, "canadian": 5255, "likely": 5256, "therapy": 5257, "liam": 5258, "economic": 5259, "indie": 5260, "route": 5261, "fight": 5262, "hope": 5263, "setting": 5264, "antly": 5265, "cross": 5266, "fantasy": 5267, "dee": 5268, "sketch": 5269, "compli": 5270, "ymi": 5271, "rules": 5272, "engineering": 5273, "figure": 5274, "row": 5275, ".,": 5276, "fw": 5277, "sydney": 5278, "wou": 5279, "tation": 5280, "drew": 5281, "uses": 5282, "there": 5283, "spread": 5284, "structure": 5285, "patrick": 5286, "apparently": 5287, "ros": 5288, "hills": 5289, "wwe": 5290, "anny": 5291, "commission": 5292, "div": 5293, "fying": 5294, "consul": 5295, "analysis": 5296, "exi": 5297, "tennis": 5298, "vehicle": 5299, "ðŁĺŃðŁĺŃ": 5300, "ass": 5301, "highly": 5302, "opened": 5303, "bann": 5304, "ðŁĴĻ": 5305, "mph": 5306, "wishing": 5307, 
"vor": 5308, "fif": 5309, "giveaway": 5310, "rr": 5311, "ray": 5312, "jess": 5313, "gat": 5314, "icymi": 5315, "xit": 5316, "highest": 5317, "york": 5318, "pie": 5319, "involved": 5320, "higher": 5321, "rie": 5322, "malay": 5323, "intelli": 5324, "despite": 5325, "chee": 5326, "sarah": 5327, "bean": 5328, "recogni": 5329, "arsen": 5330, "talented": 5331, "passion": 5332, "ich": 5333, "abc": 5334, "leads": 5335, "disease": 5336, "vis": 5337, "sec": 5338, "presenting": 5339, "milli": 5340, "hole": 5341, "shots": 5342, "depart": 5343, "surgery": 5344, "govt": 5345, "bin": 5346, "dual": 5347, "evi": 5348, "longer": 5349, "evol": 5350, "screen": 5351, "portrait": 5352, "etc": 5353, "lose": 5354, "chat": 5355, "pen": 5356, "pi": 5357, "oma": 5358, "sick": 5359, "erc": 5360, "companies": 5361, "entry": 5362, "plane": 5363, "gry": 5364, "vene": 5365, "liverpool": 5366, "premiere": 5367, "shared": 5368, "ared": 5369, "films": 5370, "ira": 5371, "holidays": 5372, "cricket": 5373, "ician": 5374, "ving": 5375, ".)": 5376, "ultimate": 5377, "division": 5378, "conduc": 5379, "sept": 5380, "forces": 5381, "mont": 5382, "smart": 5383, "disapp": 5384, "sunshine": 5385, "ind": 5386, "bless": 5387, "made": 5388, "colors": 5389, "frank": 5390, "iron": 5391, "bottle": 5392, "sgo": 5393, "mood": 5394, "jason": 5395, "eric": 5396, "birth": 5397, "teen": 5398, "response": 5399, "target": 5400, "statement": 5401, "fear": 5402, "thel": 5403, "alum": 5404, "arab": 5405, "blin": 5406, "direction": 5407, "steps": 5408, "erial": 5409, "worked": 5410, "atl": 5411, "ðŁĴķ": 5412, "felt": 5413, "poli": 5414, "scenes": 5415, "homes": 5416, "bell": 5417, "eat": 5418, "ateful": 5419, "tin": 5420, "lace": 5421, "folks": 5422, "pse": 5423, "ann": 5424, "wisdom": 5425, "fav": 5426, "butter": 5427, "sr": 5428, "areas": 5429, "smoo": 5430, "biz": 5431, "dges": 5432, "appo": 5433, "more": 5434, "them": 5435, "effect": 5436, "windows": 5437, "sunny": 5438, "capital": 5439, "totally": 5440, "cities": 5441, "grant": 5442, "mbers": 5443, "slow": 5444, "autu": 5445, "ilities": 5446, "wro": 5447, "rising": 5448, "stics": 5449, "violence": 5450, "igh": 5451, "quot": 5452, "hit": 5453, "tc": 5454, "heritage": 5455, "buff": 5456, "nes": 5457, "zar": 5458, "dential": 5459, "exac": 5460, "edge": 5461, "deep": 5462, "arena": 5463, "became": 5464, "benefits": 5465, "marks": 5466, "mber": 5467, "az": 5468, "ames": 5469, "preci": 5470, "dragon": 5471, "reg": 5472, "dings": 5473, "dos": 5474, "ðŁĴª": 5475, "nel": 5476, "sity": 5477, "meal": 5478, "dist": 5479, "legend": 5480, "purchase": 5481, "pical": 5482, "stick": 5483, "fat": 5484, "duba": 5485, "profess": 5486, "carto": 5487, "prof": 5488, "countries": 5489, "responsi": 5490, "sequ": 5491, "fab": 5492, "tribute": 5493, "honored": 5494, "practic": 5495, "purple": 5496, "anton": 5497, "pared": 5498, "tough": 5499, "summer": 5500, "environment": 5501, "sons": 5502, "ðŁĻı": 5503, "mps": 5504, "gies": 5505, "heroes": 5506, "telling": 5507, "henry": 5508, "fen": 5509, "knowledge": 5510, "Ģï¸ı": 5511, "fr": 5512, "neg": 5513, "ure": 5514, "acking": 5515, "hearts": 5516, "soo": 5517, "hollywood": 5518, "jump": 5519, "sauce": 5520, "schedule": 5521, "turn": 5522, "yoga": 5523, "creating": 5524, "cket": 5525, "creek": 5526, "âŃ": 5527, "customers": 5528, "madri": 5529, "gul": 5530, "assemb": 5531, "mount": 5532, "cell": 5533, "top": 5534, "stal": 5535, "davis": 5536, "twi": 5537, "sign": 5538, "premier": 5539, "itions": 5540, "hearing": 5541, "unk": 5542, "patients": 5543, "appear": 5544, "heaven": 
5545, "alty": 5546, "doctor": 5547, "ae": 5548, "platform": 5549, "jeff": 5550, "ðŁĵ·": 5551, "regional": 5552, "bid": 5553, "boxing": 5554, "exten": 5555, "ority": 5556, "aw": 5557, "wise": 5558, "ille": 5559, "several": 5560, "bie": 5561, "situ": 5562, "syria": 5563, "âľħ": 5564, "reminder": 5565, "entertain": 5566, "lion": 5567, "partners": 5568, "inn": 5569, "phar": 5570, "fau": 5571, "pls": 5572, "expected": 5573, "sugar": 5574, "decision": 5575, "sb": 5576, "chron": 5577, "association": 5578, "leaves": 5579, "visited": 5580, "shap": 5581, "ðŁĴĸ": 5582, "further": 5583, "hann": 5584, "wi": 5585, "runs": 5586, "ler": 5587, "funding": 5588, "filled": 5589, "......": 5590, "tiny": 5591, "hang": 5592, "org": 5593, "cool": 5594, "semin": 5595, "ðŁıĨ": 5596, "spons": 5597, "navy": 5598, "saint": 5599, "drug": 5600, "dal": 5601, "roun": 5602, "covered": 5603, "traditional": 5604, "investment": 5605, "dete": 5606, "alism": 5607, "flow": 5608, "nis": 5609, "sunrise": 5610, "feat": 5611, "fted": 5612, "weird": 5613, "jere": 5614, "vegan": 5615, "medicine": 5616, "ano": 5617, "accu": 5618, "delivery": 5619, "temple": 5620, "changing": 5621, "wilson": 5622, "philipp": 5623, "refe": 5624, "nd": 5625, "iser": 5626, "gay": 5627, "rand": 5628, "atives": 5629, "tely": 5630, "pand": 5631, "intellig": 5632, "gare": 5633, "ambas": 5634, "demon": 5635, "committee": 5636, "strategy": 5637, "refuge": 5638, "budget": 5639, "protec": 5640, "pier": 5641, "express": 5642, "nomin": 5643, "economy": 5644, "allow": 5645, "icon": 5646, "galax": 5647, "oh": 5648, "indivi": 5649, "demand": 5650, "virgin": 5651, "luke": 5652, "alists": 5653, "mani": 5654, "smi": 5655, "judge": 5656, "enty": 5657, "michi": 5658, "result": 5659, "amed": 5660, "speaks": 5661, "',": 5662, "houston": 5663, "shin": 5664, "bing": 5665, "fly": 5666, "chem": 5667, "auto": 5668, "vas": 5669, "get": 5670, "arm": 5671, "thanks": 5672, "din": 5673, "gang": 5674, "xx": 5675, "sion": 5676, "located": 5677, "pl": 5678, "josh": 5679, "info": 5680, "joins": 5681, "adverti": 5682, "otd": 5683, "eld": 5684, "sie": 5685, "reasons": 5686, "vent": 5687, "ðŁĩºðŁĩ¸": 5688, "âł": 5689, "conversation": 5690, "studi": 5691, "ðŁĶ¥ðŁĶ¥": 5692, "gos": 5693, "sounds": 5694, "unit": 5695, "musc": 5696, "gel": 5697, "acked": 5698, "paci": 5699, "cos": 5700, "dere": 5701, "uu": 5702, "ao": 5703, "lam": 5704, "inspiring": 5705, "arms": 5706, "tware": 5707, "matters": 5708, "addic": 5709, "dude": 5710, "ext": 5711, "crisis": 5712, "bath": 5713, "meet": 5714, "singh": 5715, "expect": 5716, "delhi": 5717, "rescue": 5718, "worst": 5719, "aug": 5720, "shipping": 5721, "serving": 5722, "sto": 5723, "dark": 5724, "aces": 5725, "historic": 5726, "landscape": 5727, "designer": 5728, "billion": 5729, "grateful": 5730, "wake": 5731, "eve": 5732, "miller": 5733, "housing": 5734, "dynam": 5735, "isco": 5736, "beha": 5737, "shop": 5738, "prou": 5739, "eas": 5740, "asia": 5741, "eding": 5742, "kon": 5743, "department": 5744, "awar": 5745, "marine": 5746, "inci": 5747, "photographer": 5748, "tape": 5749, "logo": 5750, "rings": 5751, "dit": 5752, "----": 5753, "vinyl": 5754, "wc": 5755, "voting": 5756, "seven": 5757, "ambassad": 5758, "dallas": 5759, "tu": 5760, "comment": 5761, "kra": 5762, "bles": 5763, "wag": 5764, "ud": 5765, "audio": 5766, "strike": 5767, "official": 5768, "ots": 5769, "metho": 5770, "tools": 5771, "radi": 5772, "alan": 5773, "hunt": 5774, "watched": 5775, "ake": 5776, "fake": 5777, "drinking": 5778, "merry": 5779, "ml": 5780, "bday": 5781, "rio": 5782, "nike": 
5783, "cant": 5784, "repe": 5785, "costu": 5786, "murder": 5787, "akers": 5788, "chers": 5789, "outs": 5790, "beginning": 5791, "sos": 5792, "ades": 5793, "nin": 5794, "notes": 5795, "wrote": 5796, "solo": 5797, "ci": 5798, "lighting": 5799, "urban": 5800, "brexit": 5801, "attend": 5802, "shirts": 5803, "playo": 5804, "actress": 5805, "plic": 5806, "standard": 5807, "quotes": 5808, "parade": 5809, "ancient": 5810, "©": 5811, "turing": 5812, "ree": 5813, "primary": 5814, "flash": 5815, "citiz": 5816, "mates": 5817, "stein": 5818, "zi": 5819, "clinton": 5820, "skin": 5821, "gene": 5822, "hum": 5823, "gar": 5824, "tle": 5825, "yi": 5826, "focu": 5827, "dean": 5828, "plants": 5829, "cyber": 5830, "bu": 5831, "ome": 5832, "hop": 5833, "address": 5834, "tix": 5835, "gifts": 5836, "relationship": 5837, "subscri": 5838, "feed": 5839, "exactly": 5840, "hawks": 5841, "exo": 5842, "stress": 5843, "sn": 5844, "arrested": 5845, "ane": 5846, "software": 5847, "zero": 5848, "theme": 5849, "mumb": 5850, "immigr": 5851, "mia": 5852, "makeup": 5853, "pleasure": 5854, "univers": 5855, "harb": 5856, "engine": 5857, "aper": 5858, "rin": 5859, "bra": 5860, "institute": 5861, "leather": 5862, "alth": 5863, "singing": 5864, "cos": 5865, "ghty": 5866, "meas": 5867, "stic": 5868, "side": 5869, "insurance": 5870, "cot": 5871, "pitch": 5872, "mountains": 5873, "crimin": 5874, "supre": 5875, "valentine": 5876, "ater": 5877, "wouldn": 5878, "scale": 5879, "related": 5880, "regar": 5881, "startup": 5882, "packed": 5883, "mike": 5884, "weekly": 5885, "pts": 5886, "count": 5887, "har": 5888, "gotten": 5889, "mind": 5890, "berlin": 5891, "conditions": 5892, "switch": 5893, "corn": 5894, "save": 5895, "gli": 5896, "emergency": 5897, "tuned": 5898, "stock": 5899, "discussing": 5900, "everybody": 5901, "sday": 5902, "whether": 5903, "wrestling": 5904, "eces": 5905, "gender": 5906, "chen": 5907, "ðŁijĢ": 5908, "madrid": 5909, "marathon": 5910, "egg": 5911, "ier": 5912, "thx": 5913, "asking": 5914, "korea": 5915, "wolf": 5916, "aya": 5917, "gm": 5918, "gau": 5919, "atory": 5920, "vr": 5921, "grass": 5922, "killing": 5923, "bble": 5924, "uro": 5925, "uni": 5926, "eth": 5927, "shore": 5928, "then": 5929, "reale": 5930, "bottom": 5931, "exerc": 5932, "kar": 5933, "ories": 5934, "adri": 5935, "sands": 5936, "sex": 5937, ".'": 5938, "volunteers": 5939, "perform": 5940, "parliam": 5941, "include": 5942, "delighted": 5943, "executive": 5944, "fuel": 5945, "kiss": 5946, "ãħ": 5947, "charge": 5948, "hu": 5949, "cakes": 5950, "vet": 5951, "glu": 5952, "agree": 5953, "prices": 5954, "nau": 5955, "hl": 5956, "gru": 5957, "raj": 5958, "strength": 5959, "bic": 5960, "spending": 5961, "ales": 5962, "aven": 5963, "blast": 5964, ":(": 5965, "yof": 5966, "normal": 5967, "six": 5968, "quick": 5969, "sea": 5970, "daw": 5971, "meets": 5972, "lovers": 5973, "updated": 5974, "potat": 5975, "completed": 5976, "cook": 5977, "opportunities": 5978, "pure": 5979, "organic": 5980, "temper": 5981, "cam": 5982, "avoid": 5983, "parking": 5984, "dubai": 5985, "ando": 5986, "distri": 5987, "toy": 5988, "completely": 5989, "donald": 5990, "trial": 5991, "bass": 5992, "boun": 5993, "background": 5994, "vas": 5995, "marvel": 5996, "lum": 5997, "rus": 5998, "tool": 5999, "commissi": 6000, "throwback": 6001, "finding": 6002, "islam": 6003, "!?": 6004, "stop": 6005, "evil": 6006, "oral": 6007, "residents": 6008, "identi": 6009, "oak": 6010, "ðŁİ¶": 6011, "lil": 6012, "spanish": 6013, "chapter": 6014, "stopped": 6015, "direct": 6016, "hosted": 6017, "picked": 6018, 
"labour": 6019, "lewis": 6020, "defense": 6021, "à®": 6022, "healthcare": 6023, "whis": 6024, "math": 6025, "peak": 6026, "raised": 6027, "fix": 6028, "bull": 6029, "thir": 6030, "chelsea": 6031, "folk": 6032, "tre": 6033, "candi": 6034, "paul": 6035, "either": 6036, "adam": 6037, "poetry": 6038, "jewelry": 6039, "ðŁ¦": 6040, "pray": 6041, "ا": 6042, "gc": 6043, "oz": 6044, "wishes": 6045, "foreign": 6046, "sung": 6047, "learned": 6048, "ene": 6049, "ning": 6050, "michael": 6051, "illustration": 6052, "legendary": 6053, "wav": 6054, "bau": 6055, "ðŁļ¨": 6056, "calend": 6057, "streets": 6058, "âĨ": 6059, "monster": 6060, "buck": 6061, "gr": 6062, "school": 6063, "bath": 6064, "waste": 6065, "neck": 6066, "hawa": 6067, "beach": 6068, "replac": 6069, "ject": 6070, "oner": 6071, "factory": 6072, "count": 6073, "ðŁĵ¸": 6074, "morgan": 6075, "dering": 6076, "sean": 6077, "stephen": 6078, "dep": 6079, "novel": 6080, "videos": 6081, "ical": 6082, "pressure": 6083, "arsenal": 6084, "expre": 6085, "irs": 6086, "trending": 6087, "ssa": 6088, "flash": 6089, "resear": 6090, "through": 6091, "professor": 6092, "sculp": 6093, "tos": 6094, "gged": 6095, "mma": 6096, "bee": 6097, "ape": 6098, "hunter": 6099, "ami": 6100, "hei": 6101, "plastic": 6102, "bucks": 6103, "universe": 6104, "legen": 6105, "nigeria": 6106, "pleased": 6107, "ris": 6108, "thinks": 6109, "autumn": 6110, "ids": 6111, "dis": 6112, "anthony": 6113, "ðŁı½": 6114, "aked": 6115, "glasses": 6116, "finance": 6117, "zer": 6118, "kas": 6119, "contract": 6120, "numbers": 6121, "shaw": 6122, "partnership": 6123, "til": 6124, "launched": 6125, "sal": 6126, "victoria": 6127, "theater": 6128, "usual": 6129, "names": 6130, "period": 6131, "eliza": 6132, "ith": 6133, "barcel": 6134, "rocks": 6135, "bags": 6136, "mate": 6137, "distribu": 6138, "jon": 6139, "diffic": 6140, "alized": 6141, "curren": 6142, "scored": 6143, "bha": 6144, "dublin": 6145, "rose": 6146, "inted": 6147, "solid": 6148, "behavi": 6149, "walker": 6150, "simply": 6151, "gardens": 6152, "headed": 6153, "ini": 6154, "ohio": 6155, "weap": 6156, "fo": 6157, "glen": 6158, "estate": 6159, "random": 6160, "thunder": 6161, "thru": 6162, "kill": 6163, "jacket": 6164, "iti": 6165, "entertainment": 6166, "thanksgiving": 6167, "ental": 6168, "encoura": 6169, "elo": 6170, "ather": 6171, "tank": 6172, "highlights": 6173, "fting": 6174, "rule": 6175, "models": 6176, "border": 6177, "bjp": 6178, "husband": 6179, "indone": 6180, "kenya": 6181, "bears": 6182, "alo": 6183, "ninten": 6184, "pix": 6185, "stro": 6186, "orders": 6187, "salad": 6188, "roads": 6189, "nor": 6190, "lation": 6191, "sophi": 6192, "ðŁı¼": 6193, "pieces": 6194, "bone": 6195, "mins": 6196, "includes": 6197, "nutr": 6198, "phil": 6199, "sent": 6200, "fundra": 6201, "gain": 6202, "borough": 6203, "nad": 6204, "monday": 6205, "activity": 6206, "items": 6207, "becoming": 6208, "kenne": 6209, "detro": 6210, "cardi": 6211, "guests": 6212, "ux": 6213, "worldwide": 6214, "severe": 6215, "news": 6216, "thankful": 6217, "fiction": 6218, "vege": 6219, "mall": 6220, "sian": 6221, "eral": 6222, "injury": 6223, "lee": 6224, "menu": 6225, "dancing": 6226, "scotti": 6227, "example": 6228, "(#": 6229, "nai": 6230, "studios": 6231, "bai": 6232, "ðŁĴĽ": 6233, "jav": 6234, "diamond": 6235, "vince": 6236, "rick": 6237, "protection": 6238, "lincol": 6239, "champs": 6240, "approach": 6241, "dar": 6242, "mile": 6243, "clouds": 6244, "jeff": 6245, "infin": 6246, "lers": 6247, "ples": 6248, "peace": 6249, "gop": 6250, "âĻ¡": 6251, "techn": 6252, "stra": 
6253, "average": 6254, "effort": 6255, "introducing": 6256, "diversity": 6257, "australian": 6258, "amp": 6259, "boost": 6260, "ske": 6261, "patient": 6262, "appreciate": 6263, "icians": 6264, "pur": 6265, "fell": 6266, "woods": 6267, "illustr": 6268, "ðŁĸ": 6269, "agency": 6270, "actions": 6271, "britain": 6272, "underway": 6273, "seattle": 6274, "eland": 6275, "ago": 6276, "fill": 6277, "streaming": 6278, "protest": 6279, "challenges": 6280, "kyo": 6281, "etsy": 6282, "cooking": 6283, "expert": 6284, "russ": 6285, "rainbow": 6286, "commercial": 6287, "spin": 6288, "beats": 6289, "cry": 6290, "valu": 6291, "eli": 6292, "throw": 6293, "grams": 6294, "levels": 6295, "michigan": 6296, "cad": 6297, "adorable": 6298, "constitu": 6299, "ws": 6300, "pub": 6301, "midnight": 6302, "that": 6303, "netfli": 6304, "brazil": 6305, "diego": 6306, "regular": 6307, "joy": 6308, "âĤ¬": 6309, "liqu": 6310, "eastern": 6311, "kni": 6312, "flat": 6313, "np": 6314, "brown": 6315, "wer": 6316, "sey": 6317, "tters": 6318, "acting": 6319, "vanc": 6320, "cycling": 6321, "programme": 6322, "raw": 6323, "complex": 6324, "tattoo": 6325, "throwbackthursday": 6326, "sessions": 6327, "rooms": 6328, "sight": 6329, "species": 6330, "bomb": 6331, "laugh": 6332, "keeps": 6333, "moon": 6334, "officers": 6335, "conver": 6336, "tr": 6337, "hash": 6338, "tack": 6339, "rious": 6340, "adap": 6341, "aj": 6342, "recogn": 6343, "expo": 6344, "sugge": 6345, "confirmed": 6346, "rolling": 6347, "dressing": 6348, "ict": 6349, "friday": 6350, "phones": 6351, "ridge": 6352, "concept": 6353, "roy": 6354, "keys": 6355, "effor": 6356, "cate": 6357, "kne": 6358, "even": 6359, "lay": 6360, "communities": 6361, "mod": 6362, "naz": 6363, "everywhere": 6364, "alab": 6365, "bitcoin": 6366, "banks": 6367, "outdoor": 6368, "federal": 6369, "stores": 6370, "hp": 6371, "cal": 6372, "mely": 6373, "signific": 6374, "bear": 6375, "republic": 6376, "closer": 6377, "allah": 6378, "pick": 6379, "xd": 6380, "palace": 6381, "chill": 6382, "bam": 6383, "erous": 6384, "una": 6385, "allen": 6386, "outstanding": 6387, "olympic": 6388, "supply": 6389, "figu": 6390, "vau": 6391, "lp": 6392, "charlie": 6393, "unes": 6394, ">>>": 6395, "legends": 6396, "icial": 6397, "coast": 6398, "benefit": 6399, "multi": 6400, "fits": 6401, "farmers": 6402, "amount": 6403, "sisters": 6404, "harve": 6405, "honey": 6406, "queen": 6407, "bers": 6408, "plann": 6409, "âŃIJ": 6410, "mu": 6411, "barcelona": 6412, "alber": 6413, "status": 6414, "remain": 6415, "extra": 6416, "candy": 6417, "vious": 6418, "âľĮ": 6419, "ov": 6420, "warriors": 6421, "-->": 6422, "jump": 6423, "amar": 6424, "xmas": 6425, "studies": 6426, "iors": 6427, "kor": 6428, "donate": 6429, "prep": 6430, "fish": 6431, "ima": 6432, "painted": 6433, "admini": 6434, "cosplay": 6435, "sports": 6436, "drops": 6437, "fighter": 6438, "evidence": 6439, "ðŁĴª": 6440, "lake": 6441, "rob": 6442, "cinema": 6443, "profile": 6444, "ñ": 6445, "stands": 6446, "legacy": 6447, "shape": 6448, "roof": 6449, "civil": 6450, "ians": 6451, "syl": 6452, "sham": 6453, "voted": 6454, "retail": 6455, "philli": 6456, "listed": 6457, "duty": 6458, "nb": 6459, "thes": 6460, "fare": 6461, "auction": 6462, "fficial": 6463, "storms": 6464, "dp": 6465, "loun": 6466, "shops": 6467, "aly": 6468, "anime": 6469, "multiple": 6470, "ðŁĺįðŁĺį": 6471, "psycho": 6472, "jean": 6473, "apart": 6474, "candidate": 6475, "ggy": 6476, "conf": 6477, "joseph": 6478, "wick": 6479, "meat": 6480, "frame": 6481, "cl": 6482, "forgot": 6483, "phy": 6484, "fing": 6485, "lied": 
6486, "rep": 6487, "seed": 6488, "fall": 6489, "ufc": 6490, "nut": 6491, "lind": 6492, "mode": 6493, "fields": 6494, "ence": 6495, "sley": 6496, "ðŁ¤Ķ": 6497, "chill": 6498, "followed": 6499, "announces": 6500, "corru": 6501, "trophy": 6502, "themselves": 6503, "acle": 6504, "aldu": 6505, "kong": 6506, "lon": 6507, "sv": 6508, "broke": 6509, "anderson": 6510, "tai": 6511, "story": 6512, "temporary": 6513, "activities": 6514, "kati": 6515, "ariz": 6516, "crystal": 6517, "spoke": 6518, "extremely": 6519, "trading": 6520, "ðŁĴļ": 6521, "ü": 6522, "inch": 6523, "edin": 6524, "outfit": 6525, "equip": 6526, "madi": 6527, "formed": 6528, "beef": 6529, "pop": 6530, "tiger": 6531, "thisday": 6532, "tired": 6533, "neighb": 6534, "retro": 6535, "isa": 6536, "unt": 6537, "tas": 6538, "kansas": 6539, "dest": 6540, "seconds": 6541, "tay": 6542, "hurric": 6543, "ou": 6544, "galaxy": 6545, "daddy": 6546, "brow": 6547, "burger": 6548, "enced": 6549, "desk": 6550, "accur": 6551, "secretary": 6552, "elite": 6553, "kab": 6554, "chin": 6555, "tourism": 6556, "buddy": 6557, "icide": 6558, "dressed": 6559, "ud": 6560, "vacation": 6561, "cheers": 6562, "comfor": 6563, "characters": 6564, "jet": 6565, "buying": 6566, "lins": 6567, "nap": 6568, "realestate": 6569, "lie": 6570, "afc": 6571, "iii": 6572, "fame": 6573, "nr": 6574, "bat": 6575, "agent": 6576, "makers": 6577, "âĢ¼": 6578, "sector": 6579, "opti": 6580, "leon": 6581, "diet": 6582, "prayer": 6583, "hip": 6584, "mir": 6585, "lex": 6586, "bry": 6587, "ana": 6588, "passing": 6589, "wen": 6590, "recovery": 6591, "aki": 6592, "popul": 6593, "resort": 6594, "maria": 6595, "stuck": 6596, "reads": 6597, "tier": 6598, "perfec": 6599, "netflix": 6600, "poo": 6601, "champ": 6602, "oc": 6603, "reduce": 6604, "wered": 6605, "comments": 6606, "claim": 6607, "accident": 6608, "sag": 6609, "hack": 6610, "salt": 6611, "kinda": 6612, "killer": 6613, "ios": 6614, "zy": 6615, "exchange": 6616, "lecture": 6617, "enger": 6618, "icking": 6619, "tau": 6620, "reveals": 6621, "prison": 6622, "zom": 6623, "ghan": 6624, "ul": 6625, "journal": 6626, "iot": 6627, "trin": 6628, "jona": 6629, "governor": 6630, "cape": 6631, "quarter": 6632, "spective": 6633, "impressive": 6634, "babies": 6635, "tx": 6636, "mill": 6637, "oy": 6638, "harri": 6639, "joint": 6640, "sue": 6641, "collaboration": 6642, "trend": 6643, "revolution": 6644, "renew": 6645, "alumni": 6646, "gett": 6647, "shell": 6648, "sunday": 6649, "entu": 6650, "nic": 6651, "donaldtrump": 6652, "blockchain": 6653, "pacific": 6654, "explains": 6655, "spy": 6656, "advoc": 6657, "paradi": 6658, "tof": 6659, "starring": 6660, "pav": 6661, "feed": 6662, "brac": 6663, "smoke": 6664, "hamp": 6665, "yam": 6666, "tokyo": 6667, "simon": 6668, "dh": 6669, "effici": 6670, "physical": 6671, "nj": 6672, "elli": 6673, "slow": 6674, "graduate": 6675, "americans": 6676, "tify": 6677, "fred": 6678, "apore": 6679, "finds": 6680, "robin": 6681, "wet": 6682, "notice": 6683, "semi": 6684, "unve": 6685, "kom": 6686, "pilot": 6687, "screening": 6688, "daily": 6689, "ðŁĴĹ": 6690, "royal": 6691, "spa": 6692, "votes": 6693, "nag": 6694, "whate": 6695, "attending": 6696, "experim": 6697, "addition": 6698, "kate": 6699, "stol": 6700, "mali": 6701, "foot": 6702, "christ": 6703, "chan": 6704, "dee": 6705, "licen": 6706, "global": 6707, "moore": 6708, "tia": 6709, "brigh": 6710, "mystery": 6711, "yay": 6712, "âĿ¤ï¸ıâĿ¤ï¸ı": 6713, "creati": 6714, "mechan": 6715, "clock": 6716, "dic": 6717, "âĢĶ": 6718, "pper": 6719, "alph": 6720, "throughout": 6721, "allow": 
6722, "resources": 6723, "selection": 6724, "hamil": 6725, "bbq": 6726, "aaaa": 6727, "virginia": 6728, "disney": 6729, "eng": 6730, "sored": 6731, "drinks": 6732, "fancy": 6733, "consider": 6734, "enda": 6735, "jane": 6736, "handmade": 6737, "dul": 6738, "ontari": 6739, "ius": 6740, "sville": 6741, "colorado": 6742, "whatever": 6743, "wheel": 6744, "promise": 6745, "never": 6746, "designs": 6747, "ably": 6748, "sexual": 6749, "vancou": 6750, "ati": 6751, "convention": 6752, "cultural": 6753, "singapore": 6754, "promo": 6755, "loaded": 6756, "glasgo": 6757, "ppl": 6758, "noo": 6759, "kee": 6760, "stem": 6761, "mention": 6762, "ido": 6763, "cruise": 6764, "riding": 6765, "becomes": 6766, "bey": 6767, "âļ½ï¸ı": 6768, "twin": 6769, "dedicated": 6770, "nash": 6771, "desi": 6772, "workout": 6773, "jenni": 6774, "iv": 6775, "groups": 6776, "relax": 6777, "phoeni": 6778, "lift": 6779, "mixed": 6780, "mck": 6781, "pc": 6782, "must": 6783, "metro": 6784, "cies": 6785, "yar": 6786, "aim": 6787, "anger": 6788, "ie": 6789, "recy": 6790, "married": 6791, "dropped": 6792, "engag": 6793, "lest": 6794, "ambassador": 6795, "oph": 6796, "des": 6797, "wick": 6798, "assistant": 6799, "natur": 6800, "fail": 6801, "ltd": 6802, "short": 6803, "kap": 6804, "shaw": 6805, "bigger": 6806, "remains": 6807, "critical": 6808, "survey": 6809, "coverage": 6810, "erson": 6811, "wind": 6812, "nb": 6813, "billy": 6814, "letes": 6815, "acts": 6816, "jimmy": 6817, "atlan": 6818, "aland": 6819, "tc": 6820, "importance": 6821, "damage": 6822, "fg": 6823, "storage": 6824, "twt": 6825, "bond": 6826, "balance": 6827, "crying": 6828, "puppy": 6829, "vote": 6830, "push": 6831, "ðŁĴľ": 6832, "poly": 6833, "mel": 6834, "london": 6835, "terrori": 6836, "effective": 6837, "corporate": 6838, "atlanta": 6839, "jaco": 6840, "nasa": 6841, "greek": 6842, "senate": 6843, "ish": 6844, "eva": 6845, "intelligence": 6846, "efforts": 6847, "alco": 6848, "kun": 6849, "hall": 6850, "diag": 6851, "claims": 6852, "first": 6853, "hb": 6854, "bae": 6855, "vul": 6856, "pull": 6857, "°": 6858, "separ": 6859, "speed": 6860, "victi": 6861, "onthisday": 6862, "audience": 6863, "rates": 6864, "teach": 6865, "filming": 6866, "bush": 6867, "song": 6868, "yum": 6869, "brun": 6870, "raine": 6871, "awa": 6872, "parks": 6873, "ðĿ": 6874, "rabb": 6875, "rach": 6876, "raid": 6877, "reached": 6878, "rail": 6879, "moves": 6880, "selected": 6881, "fri": 6882, "raising": 6883, "omy": 6884, "stones": 6885, "suk": 6886, "francisco": 6887, "cases": 6888, "capit": 6889, "confu": 6890, "wtf": 6891, "poke": 6892, "equipment": 6893, "greg": 6894, "essential": 6895, "offering": 6896, "nex": 6897, "pies": 6898, "bec": 6899, "creation": 6900, "chairman": 6901, "crown": 6902, "wal": 6903, "johnny": 6904, "shift": 6905, "neck": 6906, "bang": 6907, "bird": 6908, "ðŁĺı": 6909, "duck": 6910, "reserve": 6911, "depu": 6912, "masters": 6913, "overall": 6914, "notic": 6915, "juice": 6916, "sneak": 6917, "cheer": 6918, "classes": 6919, "eagles": 6920, "nca": 6921, "carpet": 6922, "civil": 6923, "coaches": 6924, "harris": 6925, "ups": 6926, "balls": 6927, "decor": 6928, "martin": 6929, "ros": 6930, "vice": 6931, "announcement": 6932, "whose": 6933, "tigers": 6934, "stered": 6935, "cts": 6936, "dram": 6937, "steel": 6938, "young": 6939, "install": 6940, "suppo": 6941, "recording": 6942, "deck": 6943, "seats": 6944, "lder": 6945, "angle": 6946, "bot": 6947, "styles": 6948, "elections": 6949, "fortun": 6950, "nab": 6951, "butter": 6952, "arian": 6953, "kash": 6954, "inner": 6955, "oured": 6956, 
"beast": 6957, "wei": 6958, "iconic": 6959, "experts": 6960, "necess": 6961, "beng": 6962, "james": 6963, "lia": 6964, "greece": 6965, "ðŁĵ·": 6966, "ðŁĺģ": 6967, "goodbye": 6968, "mitch": 6969, "twice": 6970, "mumbai": 6971, "steam": 6972, "rush": 6973, "medal": 6974, "nett": 6975, "fashion": 6976, "tar": 6977, "rs": 6978, "saving": 6979, "ricul": 6980, "lm": 6981, "sleeping": 6982, "brooklyn": 6983, "miss": 6984, "sending": 6985, "discovered": 6986, "sphere": 6987, "oftheday": 6988, "kicks": 6989, "missions": 6990, "wright": 6991, "ern": 6992, "ghtly": 6993, "ious": 6994, "melbourne": 6995, "startu": 6996, "moved": 6997, "carry": 6998, "dak": 6999, "agues": 7000, "belgi": 7001, "ema": 7002, "wayne": 7003, "dot": 7004, "erie": 7005, "pel": 7006, "itunes": 7007, "matthew": 7008, "nobody": 7009, "estab": 7010, "calm": 7011, "winds": 7012, "luc": 7013, "prepare": 7014, "trends": 7015, "exercise": 7016, "advant": 7017, "ðŁĴ¯": 7018, "athletics": 7019, "apps": 7020, "ctions": 7021, "advance": 7022, "launches": 7023, "little": 7024, "realdonaldtrump": 7025, "elizabeth": 7026, "carolina": 7027, "hub": 7028, "hidden": 7029, "nw": 7030, "user": 7031, "poll": 7032, "greater": 7033, "most": 7034, "fed": 7035, "pat": 7036, "lifestyle": 7037, "sati": 7038, "scores": 7039, "marriage": 7040, "lr": 7041, "avenue": 7042, "deserve": 7043, "rif": 7044, "ðŁĹ": 7045, "watch": 7046, "championships": 7047, "gray": 7048, "enni": 7049, "cotton": 7050, "gom": 7051, "where": 7052, "package": 7053, "sum": 7054, "absolu": 7055, "newly": 7056, "foods": 7057, "tyler": 7058, "assembly": 7059, "muslim": 7060, "bank": 7061, "rememb": 7062, "options": 7063, "producer": 7064, "lando": 7065, "funds": 7066, "upper": 7067, "shadow": 7068, "progre": 7069, "cop": 7070, "inge": 7071, "legs": 7072, "detroit": 7073, "hillary": 7074, "jose": 7075, "giants": 7076, "soup": 7077, "sustainable": 7078, "tus": 7079, "clothes": 7080, "rocking": 7081, "nz": 7082, "minne": 7083, "materi": 7084, "bruce": 7085, "eart": 7086, "casting": 7087, "independent": 7088, "thousands": 7089, "tah": 7090, "decl": 7091, "veterans": 7092, "lions": 7093, "wrap": 7094, "âĢ¦": 7095, "dess": 7096, "bling": 7097, "stine": 7098, "eggs": 7099, "oon": 7100, "closing": 7101, "zay": 7102, "att": 7103, "bacon": 7104, "fail": 7105, "arizona": 7106, "depre": 7107, "ghost": 7108, "newsp": 7109, "wers": 7110, "vip": 7111, "liked": 7112, "ident": 7113, "volunteer": 7114, "adult": 7115, "pupp": 7116, "circle": 7117, "material": 7118, "degree": 7119, "grown": 7120, "boom": 7121, "calendar": 7122, "sur": 7123, "viewing": 7124, "athletes": 7125, "chand": 7126, "rell": 7127, "asian": 7128, "entr": 7129, "volley": 7130, "victims": 7131, "body": 7132, "mama": 7133, "transfer": 7134, "geek": 7135, "indic": 7136, "saved": 7137, "mai": 7138, "gent": 7139, "its": 7140, "lounge": 7141, "kol": 7142, "theory": 7143, "situation": 7144, "islands": 7145, "arth": 7146, "zoo": 7147, "flood": 7148, "viously": 7149, "showed": 7150, "parliament": 7151, "chev": 7152, "eline": 7153, "attrac": 7154, "abad": 7155, "tail": 7156, "hrs": 7157, "lus": 7158, "portu": 7159, "gory": 7160, "provides": 7161, "toys": 7162, "death": 7163, "infe": 7164, "ance": 7165, "gle": 7166, "liam": 7167, "lover": 7168, "hud": 7169, "dvd": 7170, "revealed": 7171, "gw": 7172, "rement": 7173, "cathe": 7174, "lying": 7175, "radio": 7176, "derby": 7177, "stors": 7178, "chemi": 7179, "hospit": 7180, "⾨": 7181, "':": 7182, "ilove": 7183, "lemon": 7184, "republic": 7185, "sni": 7186, "ness": 7187, "door": 7188, "reaction": 7189, 
"pregn": 7190, "flav": 7191, "scholar": 7192, "spotify": 7193, "isation": 7194, "visual": 7195, "aware": 7196, "sponsored": 7197, "joke": 7198, "lessons": 7199, "legis": 7200, "lock": 7201, "simil": 7202, "ðŁĺĭ": 7203, "kind": 7204, "lay": 7205, "mah": 7206, "hoping": 7207, "vancouver": 7208, "aser": 7209, "cleaning": 7210, "gala": 7211, "threat": 7212, "lap": 7213, "ache": 7214, "romance": 7215, "expen": 7216, "repost": 7217, "zam": 7218, "epi": 7219, "mirror": 7220, "oak": 7221, "adul": 7222, "batman": 7223, "slu": 7224, "lc": 7225, "viewed": 7226, "reviews": 7227, "dates": 7228, "indonesia": 7229, "activi": 7230, "offen": 7231, "leaf": 7232, "isi": 7233, "agricul": 7234, "costume": 7235, "sites": 7236, "spiritu": 7237, "appearance": 7238, "iry": 7239, "stair": 7240, "application": 7241, "spectac": 7242, "icity": 7243, "skies": 7244, "handle": 7245, "punk": 7246, "paradise": 7247, "tn": 7248, "deal": 7249, "providing": 7250, "doc": 7251, "receiving": 7252, "brew": 7253, "microsoft": 7254, "ö": 7255, "ferr": 7256, "metro": 7257, "thail": 7258, "yum": 7259, "carter": 7260, "á": 7261, "gentle": 7262, "breaks": 7263, "cooper": 7264, "showcase": 7265, "cutting": 7266, "egypt": 7267, "baby": 7268, "seminar": 7269, "glori": 7270, "sson": 7271, "fave": 7272, "rehear": 7273, "lotte": 7274, "lady": 7275, "alas": 7276, "prep": 7277, "delivered": 7278, "nuclear": 7279, "iro": 7280, "engagement": 7281, "atta": 7282, "conven": 7283, "zan": 7284, "glory": 7285, "holds": 7286, "businesses": 7287, "strange": 7288, "sche": 7289, "itself": 7290, "grad": 7291, "markets": 7292, "falling": 7293, "stats": 7294, "geon": 7295, "budd": 7296, "lis": 7297, "sheet": 7298, "thisi": 7299, "colo": 7300, "desert": 7301, "registration": 7302, "ign": 7303, "explain": 7304, "interior": 7305, "laws": 7306, "writers": 7307, "springs": 7308, "kr": 7309, "fried": 7310, "bloom": 7311, "infra": 7312, "ao": 7313, "cred": 7314, "past": 7315, "lineup": 7316, "boo": 7317, "brea": 7318, "boots": 7319, "celebrity": 7320, "attacks": 7321, "brook": 7322, "eves": 7323, "excu": 7324, "cherry": 7325, "oop": 7326, "fascin": 7327, "boyfriend": 7328, "seas": 7329, "nine": 7330, "effects": 7331, "powered": 7332, "kha": 7333, "ðŁĺĢ": 7334, "shout": 7335, "condition": 7336, "ij": 7337, "hero": 7338, "enterpri": 7339, "winter": 7340, "applications": 7341, "shoe": 7342, "gel": 7343, "battle": 7344, "programs": 7345, "wart": 7346, "ðŁĴ¥": 7347, "rap": 7348, "hol": 7349, "dangerous": 7350, "dia": 7351, "counter": 7352, "rics": 7353, "ior": 7354, "knight": 7355, "coat": 7356, "emotional": 7357, "atures": 7358, "das": 7359, "wheel": 7360, "forecast": 7361, "transport": 7362, "glasgow": 7363, "kingdom": 7364, "preparing": 7365, "immedi": 7366, "ffin": 7367, "awarded": 7368, "printing": 7369, "roman": 7370, "fighters": 7371, "anymore": 7372, "belt": 7373, "pine": 7374, "wine": 7375, "xi": 7376, "employees": 7377, "logies": 7378, "alled": 7379, "demo": 7380, "birthday": 7381, "angeles": 7382, "log": 7383, "drivers": 7384, "necklace": 7385, "kath": 7386, "sit": 7387, "athlete": 7388, "efs": 7389, "sburg": 7390, "purpose": 7391, "resistance": 7392, "releases": 7393, "tis": 7394, "various": 7395, "deliver": 7396, "chal": 7397, "sanc": 7398, "oppo": 7399, "craw": 7400, "neuro": 7401, "dra": 7402, "supporters": 7403, "snap": 7404, "difficult": 7405, "swear": 7406, "logist": 7407, "path": 7408, "attempt": 7409, "à¥": 7410, "swimming": 7411, "steve": 7412, "hurt": 7413, "included": 7414, "bap": 7415, "ware": 7416, "ðŁĴĭ": 7417, "enders": 7418, "jake": 7419, 
"leeds": 7420, "climb": 7421, "lb": 7422, "imple": 7423, "lisa": 7424, "clothing": 7425, "ðŁĺİ": 7426, "dt": 7427, "compla": 7428, "swing": 7429, "straw": 7430, "vals": 7431, "kle": 7432, "users": 7433, "storm": 7434, "cuts": 7435, "ontario": 7436, "pan": 7437, "handsome": 7438, "iow": 7439, "argu": 7440, "checking": 7441, "scottish": 7442, "Ķï¸ı": 7443, "sier": 7444, "emma": 7445, "pod": 7446, "pattern": 7447, "desh": 7448, "enh": 7449, "edward": 7450, "ting": 7451, "kh": 7452, "half": 7453, "lincoln": 7454, "mother": 7455, "alleg": 7456, "rc": 7457, "volleyball": 7458, "dn": 7459, "gay": 7460, "ally": 7461, "leton": 7462, "grove": 7463, "loud": 7464, "advanced": 7465, "respec": 7466, "client": 7467, "supreme": 7468, "thailand": 7469, "how": 7470, "gig": 7471, "toi": 7472, "dot": 7473, "dollar": 7474, "ðŁijĩ": 7475, "pit": 7476, "rb": 7477, "hn": 7478, "produced": 7479, "ggers": 7480, "âĨĴ": 7481, "mlb": 7482, "canvas": 7483, "fineart": 7484, "usd": 7485, "inthe": 7486, "pson": 7487, "actual": 7488, "sl": 7489, "tb": 7490, "ipad": 7491, "ensure": 7492, "umb": 7493, "wd": 7494, "ska": 7495, "mars": 7496, "kend": 7497, "feli": 7498, "thing": 7499, "countdown": 7500, "absolute": 7501, "rout": 7502, "dral": 7503, "py": 7504, "injured": 7505, "mint": 7506, "hunting": 7507, "mmer": 7508, "sage": 7509, "ligh": 7510, "acity": 7511, "expan": 7512, "murray": 7513, "aro": 7514, "secure": 7515, "fourth": 7516, "eagle": 7517, "relief": 7518, "stakes": 7519, "industrial": 7520, "clark": 7521, "understanding": 7522, "seem": 7523, "plenty": 7524, "silver": 7525, "clau": 7526, "threat": 7527, "sail": 7528, "produce": 7529, "abstr": 7530, "isis": 7531, "br": 7532, "engers": 7533, "worry": 7534, "bieber": 7535, "sj": 7536, "justin": 7537, "realize": 7538, "kyle": 7539, "espn": 7540, "filter": 7541, "sch": 7542, "types": 7543, "gamedev": 7544, "ding": 7545, "twitter": 7546, "soldiers": 7547, "pom": 7548, "carbon": 7549, "yards": 7550, "childhood": 7551, "ried": 7552, "kel": 7553, "eleph": 7554, "tons": 7555, "keynote": 7556, "quiet": 7557, "wire": 7558, "posting": 7559, "issa": 7560, "representing": 7561, "backs": 7562, "alexander": 7563, "celebrates": 7564, "taining": 7565, "||": 7566, "chor": 7567, "escape": 7568, "peek": 7569, "tives": 7570, "field": 7571, "ssie": 7572, "impac": 7573, "sponsor": 7574, "rc": 7575, "wedd": 7576, "cannab": 7577, "sides": 7578, "tracks": 7579, "compar": 7580, "contrac": 7581, "technical": 7582, "bible": 7583, "exploring": 7584, "share": 7585, "trav": 7586, "nate": 7587, "illo": 7588, "scru": 7589, "mingham": 7590, "guns": 7591, "ofthe": 7592, "shame": 7593, "sees": 7594, "catho": 7595, "access": 7596, "cel": 7597, "reported": 7598, "»": 7599, "mario": 7600, "pad": 7601, "hopefully": 7602, "ouse": 7603, "yon": 7604, "disappo": 7605, "olo": 7606, "pitt": 7607, "pac": 7608, "gap": 7609, "crush": 7610, "sg": 7611, "kle": 7612, "gem": 7613, "empire": 7614, "dirty": 7615, "ais": 7616, "aviation": 7617, "zealand": 7618, "facing": 7619, "highway": 7620, "danny": 7621, "spider": 7622, "otta": 7623, "ðŁĺĦ": 7624, "wy": 7625, "colours": 7626, "infl": 7627, "costs": 7628, "olympics": 7629, "aus": 7630, "hm": 7631, "howard": 7632, "passes": 7633, "lauren": 7634, "mush": 7635, "opin": 7636, "rho": 7637, "discount": 7638, "operation": 7639, "emily": 7640, "mmm": 7641, "chamber": 7642, "dil": 7643, "toyo": 7644, "ship": 7645, "samu": 7646, "pictured": 7647, "unic": 7648, "pol": 7649, "keeper": 7650, "cartoon": 7651, "sten": 7652, "ignor": 7653, "nations": 7654, "nl": 7655, "tasting": 7656, 
"detail": 7657, "officials": 7658, "motor": 7659, "francis": 7660, "editor": 7661, "ðŁijĩ": 7662, "pets": 7663, "rangers": 7664, "tg": 7665, "rn": 7666, "wri": 7667, "nichol": 7668, "ise": 7669, "spots": 7670, "anie": 7671, "check": 7672, "triple": 7673, "kumar": 7674, "speakers": 7675, "icing": 7676, "prepared": 7677, "abuse": 7678, "friendship": 7679, "month": 7680, "swim": 7681, "aire": 7682, "scent": 7683, "hamilton": 7684, "indian": 7685, "jes": 7686, "yummy": 7687, "tears": 7688, "dawn": 7689, "ized": 7690, "worlds": 7691, "ðŁķ": 7692, "billi": 7693, "stone": 7694, "nhs": 7695, "basic": 7696, "por": 7697, "stle": 7698, "iron": 7699, "older": 7700, "clevel": 7701, "eing": 7702, "ðŁĺįðŁĺįðŁĺį": 7703, "prints": 7704, "firm": 7705, "aircraft": 7706, "finest": 7707, "develop": 7708, "aaron": 7709, "tz": 7710, "graham": 7711, "owners": 7712, "foli": 7713, "lesson": 7714, "ques": 7715, "babe": 7716, "craft": 7717, "phen": 7718, "jun": 7719, "birmingham": 7720, "vine": 7721, "ller": 7722, "ian": 7723, "fineartamerica": 7724, "evolu": 7725, "stab": 7726, "imper": 7727, "ward": 7728, "comic": 7729, "wiz": 7730, "invited": 7731, "duke": 7732, "match": 7733, "ports": 7734, "roger": 7735, "diagno": 7736, "kept": 7737, "test": 7738, "visu": 7739, "rhy": 7740, "soc": 7741, "tox": 7742, "baker": 7743, "surface": 7744, "covers": 7745, "mans": 7746, "bits": 7747, "xbox": 7748, "ffle": 7749, "nan": 7750, "gard": 7751, "hart": 7752, "waters": 7753, "villa": 7754, "retro": 7755, "lightning": 7756, "catholic": 7757, "democracy": 7758, "neighbor": 7759, "penn": 7760, "cran": 7761, "jonathan": 7762, "laura": 7763, "vibes": 7764, "sub": 7765, "coaching": 7766, "clearly": 7767, "ukraine": 7768, "brave": 7769, "commitment": 7770, "tall": 7771, "mart": 7772, "rap": 7773, "modi": 7774, "scott": 7775, "bros": 7776, "shower": 7777, "ðŁı¾": 7778, "âĺºï¸ı": 7779, "cousin": 7780, "approach": 7781, "bre": 7782, "compos": 7783, "hilari": 7784, "philly": 7785, "gad": 7786, "quickly": 7787, "rian": 7788, "tm": 7789, "virtual": 7790, "houses": 7791, "kt": 7792, "phoenix": 7793, "wire": 7794, "ffy": 7795, "bunch": 7796, "ancing": 7797, "tale": 7798, "snapchat": 7799, "starter": 7800, "ht": 7801, "kicking": 7802, "apart": 7803, "thy": 7804, ")!": 7805, "blogger": 7806, "itz": 7807, "comfort": 7808, "angels": 7809, "wash": 7810, "\":": 7811, "argent": 7812, "request": 7813, "honest": 7814, "mighty": 7815, "bobby": 7816, "kg": 7817, "rol": 7818, "thouse": 7819, "expo": 7820, "hc": 7821, "tables": 7822, "magical": 7823, "posts": 7824, "dem": 7825, "nw": 7826, "orlando": 7827, "aber": 7828, "***": 7829, "ðŁĺľ": 7830, "environmental": 7831, "transformation": 7832, "mile": 7833, "wic": 7834, "hiring": 7835, "maine": 7836, "boar": 7837, "rying": 7838, "tis": 7839, "niture": 7840, "tweeted": 7841, "antonio": 7842, "opinion": 7843, "finale": 7844, "diy": 7845, "fis": 7846, "thin": 7847, "trouble": 7848, "lego": 7849, "files": 7850, "quart": 7851, "spa": 7852, "currency": 7853, "climate": 7854, "fanart": 7855, "railway": 7856, "space": 7857, "bands": 7858, "daniel": 7859, "motion": 7860, "leng": 7861, "holder": 7862, "occu": 7863, "marie": 7864, "cathedral": 7865, "buzz": 7866, "bies": 7867, "nascar": 7868, "bmw": 7869, "battery": 7870, "charlotte": 7871, "doctor": 7872, "zzle": 7873, "seven": 7874, "insan": 7875, "ddy": 7876, "sten": 7877, "labor": 7878, "thrilled": 7879, "seren": 7880, "documentary": 7881, "waves": 7882, "certain": 7883, "candid": 7884, "allowed": 7885, "nintendo": 7886, "starwars": 7887, "tap": 7888, "homemade": 
7889, "dles": 7890, "thering": 7891, "bree": 7892, "empty": 7893, "piano": 7894, "positi": 7895, "country": 7896, "pork": 7897, "puts": 7898, "perry": 7899, "matic": 7900, "spotlight": 7901, "tist": 7902, "orities": 7903, "wealth": 7904, "cp": 7905, "barbar": 7906, "committed": 7907, "assau": 7908, "profit": 7909, "eight": 7910, "hul": 7911, "finishing": 7912, "runner": 7913, "sso": 7914, "inspec": 7915, "charged": 7916, "christop": 7917, "losing": 7918, "coal": 7919, "hoo": 7920, "elev": 7921, "dele": 7922, "moham": 7923, "donation": 7924, "cable": 7925, "clinic": 7926, "jin": 7927, "managed": 7928, "tering": 7929, "â¬": 7930, "urban": 7931, "deputy": 7932, "bber": 7933, "burn": 7934, "academic": 7935, "ott": 7936, "stake": 7937, "iter": 7938, "stown": 7939, "acker": 7940, "adventures": 7941, "adams": 7942, "greg": 7943, "prom": 7944, "vol": 7945, "acqu": 7946, "congre": 7947, "paint": 7948, "citizens": 7949, "call": 7950, "afford": 7951, "vc": 7952, "asks": 7953, "thetic": 7954, "independence": 7955, "âĽ": 7956, "hitting": 7957, "blon": 7958, "future": 7959, "âı": 7960, "inno": 7961, "gene": 7962, "boards": 7963, "distance": 7964, "set": 7965, "remem": 7966, "thal": 7967, "prevent": 7968, "lang": 7969, "objec": 7970, "susp": 7971, "matt": 7972, "induc": 7973, "boro": 7974, "pione": 7975, "redi": 7976, "virtu": 7977, "printed": 7978, "scope": 7979, "shark": 7980, "succe": 7981, "astron": 7982, "illegal": 7983, "jag": 7984, "cting": 7985, "inee": 7986, "ato": 7987, "robin": 7988, "nutrition": 7989, "bf": 7990, "dutch": 7991, "bn": 7992, "furniture": 7993, "forgotten": 7994, "atar": 7995, "rup": 7996, "hyper": 7997, "branch": 7998, "communication": 7999, "degrees": 8000, "onia": 8001, "uncle": 8002, "promote": 8003, "orche": 8004, "wii": 8005, "js": 8006, "button": 8007, "major": 8008, "cbs": 8009, "bristol": 8010, "premium": 8011, "ordinary": 8012, "edit": 8013, "mg": 8014, "weed": 8015, "steven": 8016, ":'": 8017, "gus": 8018, "tes": 8019, "captured": 8020, "drugs": 8021, "dow": 8022, "writes": 8023, "bishop": 8024, "wheels": 8025, "alization": 8026, "discovery": 8027, "wr": 8028, "rachel": 8029, "neil": 8030, "hydr": 8031, "cutest": 8032, "entrepreneur": 8033, "korean": 8034, "oregon": 8035, "ulty": 8036, "perfectly": 8037, "supported": 8038, "historical": 8039, "twins": 8040, "elly": 8041, "wel": 8042, "devil": 8043, "income": 8044, "scientists": 8045, "deleg": 8046, "hen": 8047, "oni": 8048, "iced": 8049, "gio": 8050, "curry": 8051, "reveal": 8052, "eg": 8053, "buffalo": 8054, "nol": 8055, "opera": 8056, "cameron": 8057, "hahahaha": 8058, "jab": 8059, "graduation": 8060, "craig": 8061, "ral": 8062, "if": 8063, "organization": 8064, "lege": 8065, "gang": 8066, "sud": 8067, "edinburgh": 8068, "lack": 8069, "flies": 8070, "gate": 8071, "thrones": 8072, "qb": 8073, "thereal": 8074, "eleg": 8075, "ppin": 8076, "cles": 8077, "jamie": 8078, "tnam": 8079, "crypto": 8080, "oul": 8081, "pages": 8082, "ase": 8083, "roots": 8084, "stupid": 8085, "adid": 8086, "boot": 8087, "protein": 8088, "sap": 8089, "sium": 8090, "sus": 8091, "endor": 8092, "function": 8093, "dont": 8094, "enna": 8095, "chy": 8096, "sque": 8097, "worker": 8098, "mtv": 8099, "ea": 8100, "kan": 8101, "ðŁĴļ": 8102, "mus": 8103, "profession": 8104, "tto": 8105, "operations": 8106, "allo": 8107, "ctor": 8108, "invite": 8109, "scand": 8110, "outh": 8111, "zim": 8112, "links": 8113, "clients": 8114, "samsung": 8115, "discusses": 8116, "nell": 8117, "ultra": 8118, "somewhere": 8119, "stewart": 8120, "inet": 8121, "dez": 8122, "bout": 
8123, "factor": 8124, "tian": 8125, "trans": 8126, "jeremy": 8127, "db": 8128, "ðŁĩ¬": 8129, "orn": 8130, "developing": 8131, "spol": 8132, "cooper": 8133, "mau": 8134, "remembering": 8135, "trek": 8136, "family": 8137, "seniors": 8138, "foster": 8139, "attended": 8140, "wing": 8141, "transform": 8142, "elementary": 8143, "horiz": 8144, "listing": 8145, "malaysia": 8146, "itch": 8147, "warrior": 8148, "philippines": 8149, "russell": 8150, "mend": 8151, "initiative": 8152, "creep": 8153, "tops": 8154, "briti": 8155, "aur": 8156, "sharp": 8157, "advertising": 8158, "ugly": 8159, "achiev": 8160, "materials": 8161, "bug": 8162, "device": 8163, "bonus": 8164, "facility": 8165, "cole": 8166, "nhl": 8167, "yas": 8168, "planned": 8169, "pole": 8170, "excellence": 8171, "trick": 8172, "confl": 8173, "rp": 8174, "achieve": 8175, "loan": 8176, "swag": 8177, "jessica": 8178, "howe": 8179, "pour": 8180, "scu": 8181, "zoo": 8182, "rated": 8183, "dresses": 8184, "rebel": 8185, "mexican": 8186, "coordin": 8187, "mess": 8188, "atlantic": 8189, "tl": 8190, "oscar": 8191, "walks": 8192, "pharmac": 8193, "investigation": 8194, "...#": 8195, "cci": 8196, "easily": 8197, "mondaymotivation": 8198, "yment": 8199, "auti": 8200, "forced": 8201, "armed": 8202, "colleagues": 8203, "papers": 8204, "proper": 8205, "shake": 8206, "buc": 8207, "lean": 8208, "exhibit": 8209, "evement": 8210, "cott": 8211, "biz": 8212, "sper": 8213, "kent": 8214, "swan": 8215, "/@": 8216, "girlfriend": 8217, "hawk": 8218, "âĺĢï¸ı": 8219, "mono": 8220, "ðŁĴĽ": 8221, "statue": 8222, "ðŁĺ³": 8223, "ras": 8224, "teeth": 8225, "precious": 8226, "tile": 8227, "pam": 8228, "swift": 8229, "vali": 8230, "nose": 8231, "drunk": 8232, "experiences": 8233, "comeback": 8234, "genius": 8235, "worse": 8236, "shef": 8237, "rad": 8238, "edit": 8239, "honour": 8240, "auspol": 8241, "larry": 8242, "hire": 8243, "gordon": 8244, "achievement": 8245, "........": 8246, "suicide": 8247, "alternative": 8248, "sup": 8249, "surroun": 8250, "shake": 8251, "keith": 8252, "pepper": 8253, "turk": 8254, "criminal": 8255, "beck": 8256, "sum": 8257, "walls": 8258, "cnn": 8259, "antic": 8260, "offe": 8261, "colli": 8262, "wines": 8263, "highlight": 8264, "hawaii": 8265, "embar": 8266, "lfc": 8267, "ðŁĩ®": 8268, "mv": 8269, ">>": 8270, "atmo": 8271, "word": 8272, "carl": 8273, "shoutout": 8274, "brewing": 8275, "ìĿ": 8276, "dof": 8277, "sic": 8278, "hottest": 8279, "colon": 8280, "hhh": 8281, "shut": 8282, "lowing": 8283, "volume": 8284, "apartment": 8285, "agreement": 8286, "destro": 8287, "wee": 8288, "religious": 8289, "iowa": 8290, "rod": 8291, "landing": 8292, "represent": 8293, "ðŁĵ·:": 8294, "las": 8295, "usually": 8296, "hl": 8297, "cac": 8298, "salv": 8299, "along": 8300, "laughing": 8301, "beans": 8302, "reminds": 8303, "phase": 8304, "somebody": 8305, "mask": 8306, "ranked": 8307, "destroy": 8308, "sci": 8309, "âĢ¼ï¸ı": 8310, "gabri": 8311, "leo": 8312, "roa": 8313, "failed": 8314, "sil": 8315, "refugees": 8316, "revi": 8317, "ring": 8318, "berries": 8319, "cookies": 8320, "yy": 8321, "conservation": 8322, "shab": 8323, "humans": 8324, "determin": 8325, "ain": 8326, "niall": 8327, "assu": 8328, "mba": 8329, "from": 8330, "extreme": 8331, "vices": 8332, "commerce": 8333, "ghtful": 8334, "ordered": 8335, "supports": 8336, "recap": 8337, "vor": 8338, "dropping": 8339, "correct": 8340, "paying": 8341, "meaning": 8342, "nj": 8343, "quiz": 8344, "\"#": 8345, "business": 8346, "ðŁĩ®ðŁĩ": 8347, "indigen": 8348, "dust": 8349, "boxes": 8350, "blind": 8351, "xxx": 8352, 
"zzy": 8353, "ðŁĩ¬ðŁĩ": 8354, "ssels": 8355, "sant": 8356, "ddle": 8357, "hilarious": 8358, "design": 8359, "wondering": 8360, "vehicles": 8361, "kre": 8362, "jud": 8363, "reception": 8364, "parker": 8365, "ÃŃ": 8366, "privi": 8367, "hydro": 8368, "softball": 8369, "pollu": 8370, "locked": 8371, "bah": 8372, "ear": 8373, "script": 8374, "divi": 8375, "brace": 8376, "george": 8377, "theast": 8378, "belo": 8379, "jal": 8380, "tionary": 8381, "dental": 8382, "rocket": 8383, "purch": 8384, "shak": 8385, "manufacturing": 8386, "ez": 8387, "itis": 8388, "concep": 8389, "tball": 8390, "chs": 8391, "directed": 8392, "prayers": 8393, "ook": 8394, "philos": 8395, "variety": 8396, "chess": 8397, "server": 8398, "gand": 8399, "balti": 8400, "ðŁĵ¸": 8401, "sely": 8402, "cruz": 8403, "spectacular": 8404, "burning": 8405, "represent": 8406, "iz": 8407, "tone": 8408, "merce": 8409, "hell": 8410, "bedroom": 8411, "establi": 8412, "bol": 8413, "common": 8414, "ãĥ»": 8415, "abor": 8416, "kitty": 8417, "heights": 8418, "repair": 8419, "william": 8420, "quake": 8421, "alabama": 8422, "population": 8423, "rev": 8424, "rett": 8425, "ists": 8426, "nite": 8427, "lem": 8428, "aha": 8429, "cleveland": 8430, "rm": 8431, "pover": 8432, "obse": 8433, "montre": 8434, "mania": 8435, "®": 8436, "conne": 8437, "carni": 8438, "shah": 8439, "fy": 8440, "ua": 8441, "scor": 8442, "struggle": 8443, "bob": 8444, "''": 8445, "appropri": 8446, "decide": 8447, "ffed": 8448, "caster": 8449, "sort": 8450, "hungry": 8451, "drag": 8452, "اÙ": 8453, "grounds": 8454, "dw": 8455, "slightly": 8456, "cardin": 8457, "deadline": 8458, "bronze": 8459, "webin": 8460, "barry": 8461, "silence": 8462, "euro": 8463, "option": 8464, "earn": 8465, "ðŁĴĸ": 8466, "however": 8467, "naren": 8468, "nails": 8469, "bathroom": 8470, "vine": 8471, "phd": 8472, "mining": 8473, "garage": 8474, "()": 8475, "shoulder": 8476, "defeat": 8477, "dir": 8478, "ov": 8479, "liberty": 8480, "pleas": 8481, "xon": 8482, "compre": 8483, "av": 8484, "jin": 8485, "ables": 8486, "silent": 8487, "famili": 8488, "visits": 8489, "dipl": 8490, "habit": 8491, "millions": 8492, "regarding": 8493, "innovative": 8494, "senator": 8495, "rts": 8496, "von": 8497, "kl": 8498, "whil": 8499, "required": 8500, "âĿĦ": 8501, "luv": 8502, "presidential": 8503, "pocket": 8504, "hundre": 8505, "shown": 8506, "frozen": 8507, "toward": 8508, "fast": 8509, "confidence": 8510, "rough": 8511, "individual": 8512, "quet": 8513, "ðŁı½": 8514, "dome": 8515, "fifa": 8516, "engineer": 8517, "zen": 8518, "remix": 8519, "ðŁĺĥ": 8520, "plant": 8521, "minor": 8522, "robinson": 8523, "asy": 8524, "pulled": 8525, "certain": 8526, "potato": 8527, "(:": 8528, "pres": 8529, "occa": 8530, "wit": 8531, "item": 8532, "sie": 8533, "dating": 8534, "thompson": 8535, "owned": 8536, "anu": 8537, "vie": 8538, "tedly": 8539, "goodnight": 8540, "except": 8541, "ðŁĮŁ": 8542, "iraq": 8543, "kie": 8544, "rences": 8545, "lip": 8546, "similar": 8547, "saudi": 8548, "vig": 8549, "arthur": 8550, "picks": 8551, "milan": 8552, "honda": 8553, "maxi": 8554, "og": 8555, "stest": 8556, "arch": 8557, "analytics": 8558, "basti": 8559, "pearl": 8560, "terry": 8561, "horse": 8562, "astro": 8563, "acce": 8564, "launching": 8565, "international": 8566, "sno": 8567, "tasty": 8568, "denver": 8569, "irl": 8570, "pete": 8571, "torn": 8572, "advantage": 8573, "varsity": 8574, "\"\"": 8575, "sole": 8576, "gc": 8577, "lang": 8578, "demonstr": 8579, "olds": 8580, "unity": 8581, "nets": 8582, "inspire": 8583, "crete": 8584, "nashville": 8585, "nelson": 
8586, "eter": 8587, "walk": 8588, "hyun": 8589, "mack": 8590, "treas": 8591, "seeking": 8592, "rage": 8593, "brush": 8594, "aband": 8595, "whilst": 8596, "cocon": 8597, "hong": 8598, "shelter": 8599, "ip": 8600, "possibly": 8601, "soo": 8602, "ited": 8603, "âĦ": 8604, "races": 8605, "warming": 8606, "quin": 8607, "television": 8608, "matches": 8609, "rapi": 8610, "mental": 8611, "palm": 8612, "jennifer": 8613, "rolls": 8614, "indiana": 8615, "bars": 8616, "catching": 8617, "rescu": 8618, "candidates": 8619, "fare": 8620, "âłĢ": 8621, "seo": 8622, "vietnam": 8623, "alpha": 8624, "michelle": 8625, "visible": 8626, "regre": 8627, "wned": 8628, "apple": 8629, "lip": 8630, "ffe": 8631, "liz": 8632, "yorkshire": 8633, "hail": 8634, "seasons": 8635, "began": 8636, "md": 8637, "kc": 8638, "lap": 8639, "fascinating": 8640, "help": 8641, "ury": 8642, "ums": 8643, "nuts": 8644, "sem": 8645, "alongside": 8646, "bridge": 8647, "orial": 8648, "ove": 8649, "worldcup": 8650, "british": 8651, "comfortable": 8652, "ive": 8653, "hotels": 8654, "fairs": 8655, "horri": 8656, "sox": 8657, "dining": 8658, "stream": 8659, "barri": 8660, "ssy": 8661, "wim": 8662, "terms": 8663, "vu": 8664, "pere": 8665, "lens": 8666, "walked": 8667, "ror": 8668, "lars": 8669, "shield": 8670, "doubt": 8671, "proto": 8672, "crossing": 8673, "meant": 8674, "medium": 8675, "adding": 8676, "eb": 8677, "cheap": 8678, "func": 8679, "paper": 8680, "brands": 8681, "ryan": 8682, "feedback": 8683, "collins": 8684, "unknown": 8685, "tropical": 8686, "sandwich": 8687, "fallen": 8688, "formu": 8689, "select": 8690, "loads": 8691, "answers": 8692, "ori": 8693, "maga": 8694, "dor": 8695, "duo": 8696, "alie": 8697, "drum": 8698, "uri": 8699, "deer": 8700, "soul": 8701, "shut": 8702, "âĺº": 8703, "stolen": 8704, "donated": 8705, "buzz": 8706, "patriots": 8707, "hal": 8708, "nasty": 8709, "nominated": 8710, "monte": 8711, "kia": 8712, "thri": 8713, "ingu": 8714, "tests": 8715, "petro": 8716, "ðŁijij": 8717, "hosts": 8718, "nest": 8719, "topic": 8720, "patch": 8721, "mmy": 8722, "hugh": 8723, "abilities": 8724, "mathe": 8725, "smiles": 8726, "gb": 8727, "agenda": 8728, "insights": 8729, "chip": 8730, "phan": 8731, "failure": 8732, "dgers": 8733, "hai": 8734, "significant": 8735, "shock": 8736, "rural": 8737, "glam": 8738, "figures": 8739, "potus": 8740, "ota": 8741, "ministry": 8742, "appears": 8743, "fear": 8744, "rh": 8745, "american": 8746, "hatt": 8747, "sony": 8748, "fires": 8749, "edi": 8750, "nou": 8751, "equi": 8752, "when": 8753, "universal": 8754, "madness": 8755, "ix": 8756, "sculpture": 8757, "bach": 8758, "tto": 8759, "sweden": 8760, "eta": 8761, "ento": 8762, "developed": 8763, "monthly": 8764, "maps": 8765, "rah": 8766, "led": 8767, "delta": 8768, "saints": 8769, "islam": 8770, "bench": 8771, "fifth": 8772, "vard": 8773, "socks": 8774, "welcoming": 8775, "je": 8776, "turner": 8777, "vb": 8778, "adi": 8779, "norway": 8780, "ady": 8781, "hurricane": 8782, "porsche": 8783, "tradition": 8784, "exam": 8785, "newspaper": 8786, "luci": 8787, "aver": 8788, "ideal": 8789, "dna": 8790, "madison": 8791, "ðŁ§": 8792, "witness": 8793, "acou": 8794, "insight": 8795, "simon": 8796, "robot": 8797, "snake": 8798, "nbc": 8799, "aco": 8800, "ross": 8801, "shment": 8802, "religion": 8803, "chann": 8804, "insu": 8805, "campbell": 8806, "installed": 8807, "weather": 8808, "horses": 8809, "oli": 8810, "robert": 8811, "kaz": 8812, "ðŁıĢ": 8813, "veteran": 8814, "thread": 8815, "quarter": 8816, "easier": 8817, "capture": 8818, "hipho": 8819, "lawrence": 8820, 
"romantic": 8821, "passion": 8822, "clay": 8823, "oxford": 8824, "thai": 8825, "studying": 8826, "fia": 8827, "elected": 8828, "mostly": 8829, "cb": 8830, "tumb": 8831, "âĢįâĻĤ": 8832, "xl": 8833, "shan": 8834, "faster": 8835, "evans": 8836, "slide": 8837, "shri": 8838, "seek": 8839, "mies": 8840, "chemistry": 8841, "pumpkin": 8842, "tum": 8843, ",,": 8844, "room": 8845, "fired": 8846, "lips": 8847, "presence": 8848, "aff": 8849, "brewery": 8850, "arrive": 8851, "swag": 8852, "photograph": 8853, "pengu": 8854, "chips": 8855, "attor": 8856, "values": 8857, "accurate": 8858, "contemporary": 8859, "principal": 8860, "cannabis": 8861, "ario": 8862, "anywhere": 8863, "gia": 8864, "democrats": 8865, "buildings": 8866, "lived": 8867, "aps": 8868, "negative": 8869, "mare": 8870, "ballo": 8871, "lion": 8872, "diamon": 8873, "look": 8874, "reform": 8875, "tommy": 8876, "illa": 8877, "treats": 8878, "hundreds": 8879, "portland": 8880, "worthy": 8881, "excep": 8882, "aria": 8883, "idol": 8884, "beer": 8885, "cdn": 8886, "yu": 8887, "awk": 8888, "ðŁĩ¨": 8889, "cells": 8890, "ó": 8891, "identity": 8892, "drawn": 8893, "devil": 8894, "finger": 8895, "tham": 8896, "ðŁijĬ": 8897, "earned": 8898, "fintech": 8899, "dolph": 8900, "tweeting": 8901, "evolution": 8902, "ðŁĵį": 8903, "estim": 8904, "mvp": 8905, "none": 8906, "ðŁĩºðŁĩ¸": 8907, "toyota": 8908, "aux": 8909, "marin": 8910, "bold": 8911, "lbs": 8912, "steak": 8913, "murphy": 8914, "itable": 8915, "louis": 8916, "solve": 8917, "pia": 8918, "skir": 8919, "illino": 8920, "webinar": 8921, "banana": 8922, "lov": 8923, "thon": 8924, "voters": 8925, "affordable": 8926, "defeated": 8927, "lmfa": 8928, "airlines": 8929, "superb": 8930, "anyway": 8931, "debt": 8932, "bored": 8933, "versi": 8934, "metal": 8935, "responsible": 8936, "mk": 8937, "sse": 8938, "fay": 8939, "caused": 8940, "fp": 8941, "recommend": 8942, "plaza": 8943, "sporting": 8944, "alliance": 8945, "austri": 8946, "nn": 8947, "tours": 8948, "surprised": 8949, "artif": 8950, "thunder": 8951, "surve": 8952, "wore": 8953, "brief": 8954, "necessary": 8955, "zie": 8956, "ashley": 8957, "drake": 8958, "rt": 8959, "knife": 8960, "immun": 8961, "charges": 8962, "athe": 8963, "bride": 8964, "reply": 8965, "gav": 8966, "broadcast": 8967, "puer": 8968, "bracelet": 8969, "capacity": 8970, "harvest": 8971, "idk": 8972, "performan": 8973, "dding": 8974, "ilers": 8975, "para": 8976, "jama": 8977, "province": 8978, "chin": 8979, "iders": 8980, "hari": 8981, "teaser": 8982, "chen": 8983, "restor": 8984, "rat": 8985, "flat": 8986, "colom": 8987, "ðŁĴŀ": 8988, "ðŁĩ¨ðŁĩ": 8989, "smooth": 8990, "rt": 8991, "pitch": 8992, "staying": 8993, "israeli": 8994, "tcot": 8995, "perspective": 8996, "dock": 8997, "opener": 8998, "lovel": 8999, "xo": 9000, "classroom": 9001, "lington": 9002, "goal": 9003, "kennedy": 9004, "sham": 9005, "spaces": 9006, "mitchell": 9007, "homecoming": 9008, "uki": 9009, "claimed": 9010, "recruit": 9011, "ingo": 9012, "mufc": 9013, "monit": 9014, "groo": 9015, "resident": 9016, "percent": 9017, "perman": 9018, "ottawa": 9019, "intment": 9020, "anxi": 9021, "standards": 9022, "worship": 9023, "scheme": 9024, "fx": 9025, "potter": 9026, "bian": 9027, "athletic": 9028, "afgh": 9029, "sse": 9030, "satell": 9031, "parties": 9032, "âĿ¤âĿ¤": 9033, "infrastructure": 9034, "relax": 9035, "modu": 9036, "worn": 9037, "smoking": 9038, "yach": 9039, "practices": 9040, "wcw": 9041, "amb": 9042, "domestic": 9043, "taylor": 9044, "kentu": 9045, "provided": 9046, "modi": 9047, "veg": 9048, "\"...": 9049, "observ": 
9050, "ðŁĺ©": 9051, "beard": 9052, "mour": 9053, "angry": 9054, "ðŁĺ±": 9055, "startups": 9056, "wooden": 9057, "dive": 9058, "nail": 9059, "antique": 9060, "roses": 9061, "tornado": 9062, "mat": 9063, "^^": 9064, "suspect": 9065, "farm": 9066, "devices": 9067, "mega": 9068, "tul": 9069, "scholarship": 9070, "gee": 9071, "disaster": 9072, "arrival": 9073, "poin": 9074, "marc": 9075, "katie": 9076, "bbed": 9077, "false": 9078, "deserves": 9079, "richard": 9080, "juana": 9081, "frey": 9082, "tioned": 9083, "hybri": 9084, "rw": 9085, "sarah": 9086, "achi": 9087, "cure": 9088, "ole": 9089, "morris": 9090, "chic": 9091, "broadway": 9092, "label": 9093, "pak": 9094, "poverty": 9095, "golf": 9096, "ered": 9097, "fu": 9098, "eries": 9099, "bees": 9100, "alogue": 9101, "stel": 9102, "wireless": 9103, "jewish": 9104, "tide": 9105, "blocked": 9106, "lifetime": 9107, "bhar": 9108, "split": 9109, "amster": 9110, "thi": 9111, "joshu": 9112, "brunch": 9113, "haps": 9114, "sfor": 9115, "oops": 9116, "kapoor": 9117, "hiking": 9118, "supposed": 9119, "roof": 9120, "reas": 9121, "train": 9122, "tight": 9123, "trump": 9124, "basically": 9125, "rr": 9126, "eared": 9127, "seeds": 9128, "entrance": 9129, "cp": 9130, "wie": 9131, "sonic": 9132, "victim": 9133, "here": 9134, "eh": 9135, "earrings": 9136, "salmon": 9137, "arctic": 9138, "anne": 9139, "dougla": 9140, "corruption": 9141, "hannah": 9142, "hasn": 9143, "voices": 9144, "conce": 9145, "atta": 9146, "fleet": 9147, "clinical": 9148, "democratic": 9149, "tony": 9150, "stood": 9151, "lef": 9152, "twitch": 9153, "ail": 9154, "honestly": 9155, "increased": 9156, "drome": 9157, "donna": 9158, "accepted": 9159, "visitors": 9160, "apar": 9161, "ador": 9162, "par": 9163, "jerry": 9164, "rai": 9165, "brandon": 9166, "abu": 9167, "!!!!!!": 9168, "meme": 9169, "ingh": 9170, "glorious": 9171, "bhu": 9172, "pump": 9173, "jol": 9174, "like": 9175, "fisher": 9176, "maz": 9177, "agan": 9178, "destination": 9179, "playlist": 9180, "letters": 9181, "genu": 9182, "brace": 9183, "celebrated": 9184, "banner": 9185, "rhe": 9186, "dragon": 9187, "ðŁĺħ": 9188, "signature": 9189, "grey": 9190, "âľĶï¸ı": 9191, "alice": 9192, "bered": 9193, "pher": 9194, "bern": 9195, "cath": 9196, "gathering": 9197, "scoring": 9198, "influence": 9199, "smiling": 9200, "dept": 9201, "local": 9202, "ax": 9203, "acu": 9204, "retirement": 9205, "honor": 9206, "herself": 9207, "chemical": 9208, "assess": 9209, "yall": 9210, "frequ": 9211, "appreciation": 9212, "aca": 9213, "choir": 9214, "cuz": 9215, "soil": 9216, "cil": 9217, "reporting": 9218, "uh": 9219, "enterprise": 9220, "grat": 9221, "jacob": 9222, "rum": 9223, "fee": 9224, "jak": 9225, "spin": 9226, "bikes": 9227, "phia": 9228, "stere": 9229, "pis": 9230, "blood": 9231, "tatt": 9232, "raft": 9233, "warren": 9234, "sheri": 9235, "backstage": 9236, "marsh": 9237, "hashtag": 9238, "therine": 9239, "rein": 9240, "gameday": 9241, "guaran": 9242, "recipes": 9243, "minds": 9244, "stronger": 9245, "issued": 9246, "bicy": 9247, "nak": 9248, "mented": 9249, "scary": 9250, "ux": 9251, "previous": 9252, "ttle": 9253, "thats": 9254, "actors": 9255, "uma": 9256, "tina": 9257, "bunny": 9258, "promotion": 9259, "uss": 9260, "oliver": 9261, "montreal": 9262, "whats": 9263, "appreciated": 9264, "lakes": 9265, "excuse": 9266, "knowing": 9267, "prizes": 9268, "muscle": 9269, "shades": 9270, "scot": 9271, "ingredi": 9272, "electronic": 9273, "juan": 9274, "combat": 9275, "sri": 9276, "eh": 9277, "turkish": 9278, "lom": 9279, "strikes": 9280, "prison": 9281, "ree": 
9282, "pope": 9283, "vid": 9284, "oldest": 9285, "doll": 9286, "swiss": 9287, "certified": 9288, "clip": 9289, "returning": 9290, "lator": 9291, "leigh": 9292, "ttes": 9293, "watson": 9294, "healing": 9295, "elim": 9296, "perhaps": 9297, "hass": 9298, "kau": 9299, "dder": 9300, "mouse": 9301, "newcastle": 9302, "indigenous": 9303, "welcomes": 9304, "cole": 9305, "taught": 9306, "noise": 9307, "appear": 9308, "joe": 9309, "canon": 9310, "wednesday": 9311, "utah": 9312, "ctive": 9313, "driven": 9314, "iv": 9315, "cell": 9316, "strip": 9317, "acc": 9318, "focused": 9319, "arrest": 9320, "stocks": 9321, "woo": 9322, "âĹ": 9323, "noticed": 9324, "shado": 9325, "displa": 9326, "terror": 9327, "borne": 9328, "second": 9329, "queens": 9330, "woke": 9331, "jail": 9332, "nott": 9333, "cambridge": 9334, "hart": 9335, "seaf": 9336, "fax": 9337, "accept": 9338, "âĺħ": 9339, "goods": 9340, "kat": 9341, "twin": 9342, "hs": 9343, "thousand": 9344, "sins": 9345, "suite": 9346, "ampton": 9347, "arn": 9348, "relev": 9349, "richar": 9350, "hoops": 9351, "nbc": 9352, "classic": 9353, "pab": 9354, "soldier": 9355, "deplo": 9356, "leans": 9357, "installation": 9358, "clash": 9359, "leban": 9360, "eee": 9361, "tire": 9362, "beloved": 9363, "fusion": 9364, "traveling": 9365, "nei": 9366, "cookie": 9367, "globe": 9368, "physics": 9369, "sq": 9370, "col": 9371, "wolves": 9372, "dl": 9373, "exit": 9374, "\"-": 9375, "football": 9376, "leaf": 9377, "sterling": 9378, "hide": 9379, "minneso": 9380, "freshman": 9381, "nature": 9382, "indie": 9383, "supplies": 9384, "bris": 9385, "irish": 9386, "inktober": 9387, "doodle": 9388, "icop": 9389, "messages": 9390, "adults": 9391, "recorded": 9392, "fixed": 9393, "ardo": 9394, "offered": 9395, "underground": 9396, "drone": 9397, "pine": 9398, "mainten": 9399, "andre": 9400, "hammer": 9401, "sx": 9402, "round": 9403, "hike": 9404, "brad": 9405, "rome": 9406, "full": 9407, "oney": 9408, "rows": 9409, "columbia": 9410, "archives": 9411, "approved": 9412, "batch": 9413, "illinois": 9414, "recognition": 9415, "shouldn": 9416, "fog": 9417, "ncaa": 9418, "kevin": 9419, "humanity": 9420, "although": 9421, "powers": 9422, "pou": 9423, "sar": 9424, "pest": 9425, "alcohol": 9426, "consci": 9427, "philadel": 9428, "eno": 9429, "tm": 9430, "okla": 9431, "category": 9432, "participate": 9433, "accused": 9434, "brief": 9435, "poem": 9436, "clubs": 9437, "consult": 9438, "jab": 9439, "bigdata": 9440, "amsterdam": 9441, "acing": 9442, "certific": 9443, "nu": 9444, "dat": 9445, "improved": 9446, "andy": 9447, "campaig": 9448, "palestin": 9449, "pace": 9450, "mobi": 9451, "feelings": 9452, "wolf": 9453, "brain": 9454, "propos": 9455, "interactive": 9456, "prince": 9457, "index": 9458, "cis": 9459, "chae": 9460, "peaceful": 9461, "covering": 9462, "aco": 9463, "courses": 9464, "monkey": 9465, "replace": 9466, "bl": 9467, "bloody": 9468, "tales": 9469, "brighton": 9470, "neighborhood": 9471, "gates": 9472, "spiritual": 9473, "afraid": 9474, "breast": 9475, "bones": 9476, "ðŁijī": 9477, "video": 9478, "wau": 9479, "touch": 9480, "injuries": 9481, "carl": 9482, "rix": 9483, "unex": 9484, "âĢ¢": 9485, "fred": 9486, "considered": 9487, "thusi": 9488, "anch": 9489, "ony": 9490, "usa": 9491, "graphics": 9492, "acre": 9493, "ðŁĺ©": 9494, "commemor": 9495, "commod": 9496, "goti": 9497, "guardian": 9498, "starbucks": 9499, "prevention": 9500, "hahahaha": 9501, "administration": 9502, "portugal": 9503, "faculty": 9504, "beta": 9505, "ula": 9506, "albert": 9507, "breath": 9508, "eri": 9509, "letting": 9510, 
"tric": 9511, "mentation": 9512, "incredibly": 9513, "tennes": 9514, "vd": 9515, "ðŁĻĪ": 9516, "eddie": 9517, "brick": 9518, "grill": 9519, "btw": 9520, "watches": 9521, "researchers": 9522, "tney": 9523, "nie": 9524, "pas": 9525, "aster": 9526, "vibr": 9527, "pokemon": 9528, "chrome": 9529, "goat": 9530, "pitts": 9531, "illy": 9532, "festive": 9533, "yd": 9534, "canal": 9535, "ðŁĨ": 9536, "fies": 9537, "carlos": 9538, "reque": 9539, "partici": 9540, "trains": 9541, "sample": 9542, "temperature": 9543, "symph": 9544, "picking": 9545, "indoor": 9546, "zers": 9547, "playoffs": 9548, "________": 9549, "apes": 9550, "lyrics": 9551, "islamic": 9552, "performances": 9553, "dick": 9554, "spark": 9555, "seas": 9556, "homa": 9557, "ground": 9558, "disci": 9559, "employee": 9560, "commu": 9561, "alaska": 9562, "alan": 9563, "feast": 9564, "dging": 9565, "banking": 9566, "manuel": 9567, "slowly": 9568, "trucks": 9569, "mccar": 9570, "ooo": 9571, "scrat": 9572, "orchestra": 9573, "individu": 9574, "mx": 9575, "breath": 9576, "stairs": 9577, "equality": 9578, "blake": 9579, "locations": 9580, "coconut": 9581, "baltimore": 9582, "aaa": 9583, "lc": 9584, "ðŁıĨ": 9585, "harvey": 9586, "resist": 9587, "immigration": 9588, "adidas": 9589, "fili": 9590, "ref": 9591, "lgbt": 9592, "mos": 9593, "ppi": 9594, "kenny": 9595, "terror": 9596, "bane": 9597, "apolis": 9598, "sg": 9599, "socialmedia": 9600, "kai": 9601, "honest": 9602, "assas": 9603, "bollywood": 9604, "âĢįâĻĢï¸ı": 9605, "ferrari": 9606, "horn": 9607, "crypto": 9608, "boom": 9609, "maintenance": 9610, "idi": 9611, "sman": 9612, "wl": 9613, "extended": 9614, "insul": 9615, "ves": 9616, "gosp": 9617, "tri": 9618, "pig": 9619, "targe": 9620, "celer": 9621, "stati": 9622, "smh": 9623, "ridic": 9624, "appeal": 9625, "?)": 9626, "conclu": 9627, "cosme": 9628, "sheep": 9629, "christopher": 9630, "enthusi": 9631, "polish": 9632, "mets": 9633, "ounded": 9634, "sustainability": 9635, "creativity": 9636, "concrete": 9637, "rai": 9638, "alien": 9639, "bless": 9640, "tees": 9641, "club": 9642, "rot": 9643, "bos": 9644, "exist": 9645, "perfection": 9646, "luck": 9647, "rocky": 9648, "expensive": 9649, "meanwhile": 9650, "happybirthday": 9651, "pret": 9652, "thriller": 9653, "cave": 9654, "playoff": 9655, "somer": 9656, "lu": 9657, "lex": 9658, "defence": 9659, "amwriting": 9660, "homeless": 9661, "prophe": 9662, "chet": 9663, "pastor": 9664, "ðŁ¤£": 9665, "lander": 9666, "www": 9667, "Ģï¸ı": 9668, "tica": 9669, "!#": 9670, "otic": 9671, "radar": 9672, "posters": 9673, "powder": 9674, "poli": 9675, "haun": 9676, "trap": 9677, "blin": 9678, "assault": 9679, "shorts": 9680, "rey": 9681, "shy": 9682, "squir": 9683, "racist": 9684, "garlic": 9685, "fur": 9686, "remote": 9687, "smell": 9688, "impressed": 9689, "fingers": 9690, "âłĢ": 9691, "dino": 9692, "lement": 9693, "snu": 9694, "promoting": 9695, "string": 9696, "productive": 9697, "bage": 9698, "mason": 9699, "raz": 9700, "directly": 9701, "jk": 9702, "eval": 9703, "ðŁijĬ": 9704, "doctors": 9705, "cow": 9706, "rider": 9707, "stv": 9708, "remove": 9709, "wu": 9710, "nathan": 9711, "rod": 9712, "nr": 9713, "=>": 9714, "affected": 9715, "invest": 9716, "mption": 9717, "ginger": 9718, "od": 9719, "agriculture": 9720, "sque": 9721, "mug": 9722, "counting": 9723, "kee": 9724, "magnific": 9725, "cook": 9726, "anistan": 9727, "root": 9728, "placed": 9729, "sympo": 9730, "ghana": 9731, "und": 9732, "cheer": 9733, "throwing": 9734, "secrets": 9735, "filling": 9736, "optimi": 9737, "butterfly": 9738, "bubb": 9739, "ðŁĺī": 
9740, "terrible": 9741, "dg": 9742, "silk": 9743, "obsessed": 9744, "lou": 9745, "aide": 9746, "salute": 9747, "monu": 9748, "philadelphia": 9749, "scientific": 9750, "ist": 9751, "uae": 9752, "dessert": 9753, "bottles": 9754, "canyon": 9755, "ðŁĺĪ": 9756, "carib": 9757, "other": 9758, "wich": 9759, "resource": 9760, "guilty": 9761, "und": 9762, "leon": 9763, "ess": 9764, "kane": 9765, "ele": 9766, "trainer": 9767, "heim": 9768, "ante": 9769, "manage": 9770, "rookie": 9771, "treated": 9772, "poses": 9773, "rsvp": 9774, "causes": 9775, "awak": 9776, "jewell": 9777, "lett": 9778, "onics": 9779, "titles": 9780, "cardiff": 9781, "gaga": 9782, "bump": 9783, "useful": 9784, "?!": 9785, "loose": 9786, "bbing": 9787, "::": 9788, "argentina": 9789, "debu": 9790, "cycl": 9791, "whel": 9792, "disgu": 9793, "jel": 9794, "kills": 9795, "biology": 9796, "exter": 9797, "trash": 9798, "bodies": 9799, "tram": 9800, "circuit": 9801, "expect": 9802, "lads": 9803, "wells": 9804, "shot": 9805, "gee": 9806, "narendr": 9807, "fastest": 9808, "bent": 9809, "bills": 9810, "marshall": 9811, "hats": 9812, "introduce": 9813, "citizen": 9814, "impossible": 9815, "gib": 9816, "azz": 9817, "networking": 9818, "rant": 9819, "think": 9820, "indy": 9821, "stops": 9822, "ftheday": 9823, "brian": 9824, "**": 9825, "amodi": 9826, "dome": 9827, "courage": 9828, "packing": 9829, "affairs": 9830, "gn": 9831, "sized": 9832, "entary": 9833, "poland": 9834, "switzer": 9835, "afghanistan": 9836, "wu": 9837, "tender": 9838, "subscribe": 9839, "mosco": 9840, "attend": 9841, "republican": 9842, "honey": 9843, "âĢĭ": 9844, "simul": 9845, "wester": 9846, "foodie": 9847, "oro": 9848, "middle": 9849, "abt": 9850, "copies": 9851, "maje": 9852, "narendramodi": 9853, "typical": 9854, "inspirational": 9855, "vitam": 9856, "wiscon": 9857, "cubs": 9858, "tivity": 9859, "hali": 9860, "ears": 9861, "kay": 9862, "dare": 9863, "marijuana": 9864, "curious": 9865, "ania": 9866, "tomato": 9867, "remind": 9868, "ðŁĩ·": 9869, "scared": 9870, "coup": 9871, "poet": 9872, "landed": 9873, "rid": 9874, "wrapped": 9875, "morri": 9876, "climbing": 9877, "ews": 9878, "feeding": 9879, "contra": 9880, "thology": 9881, "grid": 9882, "tively": 9883, "reader": 9884, "laser": 9885, "diving": 9886, "dig": 9887, "latin": 9888, "tied": 9889, "shakespe": 9890, "oci": 9891, "adm": 9892, "showers": 9893, "chuck": 9894, "marcus": 9895, "oos": 9896, "knee": 9897, "olive": 9898, "owl": 9899, "dylan": 9900, "anno": 9901, "gym": 9902, "decisions": 9903, "wellness": 9904, "arrives": 9905, "satis": 9906, "chris": 9907, "thurs": 9908, "ðŁ¤£": 9909, "interviews": 9910, "thankyou": 9911, "switzerland": 9912, "overnight": 9913, "journalist": 9914, "serves": 9915, "volcan": 9916, ".......": 9917, "plot": 9918, "nicol": 9919, "carrying": 9920, "magne": 9921, "treasure": 9922, "exp": 9923, "bever": 9924, "ðŁĺ¢": 9925, "marty": 9926, "mole": 9927, "donations": 9928, "recognized": 9929, "bh": 9930, "dus": 9931, "shann": 9932, "aldo": 9933, "successfully": 9934, "ente": 9935, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 9936, "cabinet": 9937, "cuis": 9938, "titled": 9939, "das": 9940, "sol": 9941, "strategies": 9942, "delivering": 9943, "adds": 9944, "anian": 9945, "nether": 9946, "ðŁĴĥ": 9947, "contain": 9948, "suits": 9949, "pairs": 9950, "todd": 9951, "rella": 9952, "rope": 9953, "cio": 9954, "crop": 9955, "paintings": 9956, "suz": 9957, "rejec": 9958, "bust": 9959, "dh": 9960, "fraud": 9961, "mh": 9962, "control": 9963, "jeal": 9964, "destroyed": 9965, "allows": 9966, "wool": 9967, "minnesota": 9968, "omen": 
9969, "ju": 9970, "symposium": 9971, "daf": 9972, "limit": 9973, "accounts": 9974, "loading": 9975, "intern": 9976, "resolution": 9977, "holland": 9978, "qual": 9979, "meetings": 9980, "grave": 9981, "camping": 9982, "vam": 9983, "renov": 9984, "liberal": 9985, "amber": 9986, "gree": 9987, "humb": 9988, "fever": 9989, "eling": 9990, "brooks": 9991, "à²": 9992, "beth": 9993, "aded": 9994, "alt": 9995, "roe": 9996, "performed": 9997, "josh": 9998, "franklin": 9999, "nicole": 10000, "dess": 10001, "bbs": 10002, "mg": 10003, "networks": 10004, "minim": 10005, "alt": 10006, "weapons": 10007, "guy": 10008, "jason": 10009, "gha": 10010, "harbour": 10011, "aton": 10012, "praise": 10013, "kentucky": 10014, "belfast": 10015, "sticks": 10016, "bloss": 10017, "hopes": 10018, "anthro": 10019, "familiar": 10020, "wait": 10021, "chile": 10022, "depression": 10023, "lax": 10024, "jets": 10025, "leice": 10026, "receives": 10027, "sier": 10028, "ank": 10029, "dex": 10030, "indeed": 10031, "flexi": 10032, "fabric": 10033, "lamb": 10034, "helicop": 10035, "amanda": 10036, "âĢĶâĢĶ": 10037, "compete": 10038, "snack": 10039, "technologies": 10040, "syrian": 10041, "moms": 10042, "muham": 10043, "chosen": 10044, "anat": 10045, "devon": 10046, "sharks": 10047, "ret": 10048, "fundraiser": 10049, "selfies": 10050, "stations": 10051, "communications": 10052, "tennessee": 10053, "tutor": 10054, "rot": 10055, "valuable": 10056, "dynamic": 10057, "nurse": 10058, "ied": 10059, "earthquake": 10060, "deserved": 10061, "ave": 10062, "sara": 10063, "stretch": 10064, "douglas": 10065, "nepal": 10066, "ç": 10067, "obviously": 10068, "dame": 10069, "rape": 10070, "anybody": 10071, "kw": 10072, "patrol": 10073, "holders": 10074, "hanna": 10075, "infographic": 10076, "eco": 10077, "beating": 10078, "stanley": 10079, "boats": 10080, "ribb": 10081, "ez": 10082, "witch": 10083, "inva": 10084, "acid": 10085, "boarding": 10086, "-@": 10087, "gil": 10088, "dave": 10089, "careers": 10090, "oppos": 10091, "lloy": 10092, "inter": 10093, "dope": 10094, "resu": 10095, "jagu": 10096, "shade": 10097, "indy": 10098, "onist": 10099, "relations": 10100, "agen": 10101, "able": 10102, "incident": 10103, "meter": 10104, "sharma": 10105, "idr": 10106, "prove": 10107, "immediately": 10108, "troops": 10109, "aman": 10110, "glow": 10111, "gaza": 10112, "blocks": 10113, "personal": 10114, "chronic": 10115, "aller": 10116, "sid": 10117, "shr": 10118, "whatsapp": 10119, "lucy": 10120, "archae": 10121, "hou": 10122, "journalism": 10123, "ourselves": 10124, "got": 10125, "themed": 10126, "shaped": 10127, "weak": 10128, "casual": 10129, "length": 10130, "slam": 10131, "abbey": 10132, "ev": 10133, "counter": 10134, "esta": 10135, "recipi": 10136, "chapel": 10137, "expansion": 10138, "self": 10139, "suffering": 10140, "spice": 10141, "nz": 10142, "spart": 10143, "desper": 10144, "booking": 10145, "quarters": 10146, "yon": 10147, "ðŁĴĹ": 10148, "pk": 10149, "continued": 10150, "-#": 10151, "manhatt": 10152, "talked": 10153, "shen": 10154, "combo": 10155, "hybrid": 10156, "jeans": 10157, "liquid": 10158, "seal": 10159, "retweets": 10160, "acceler": 10161, "collective": 10162, "tas": 10163, ":))": 10164, "professionals": 10165, "raw": 10166, "ott": 10167, "susan": 10168, "iring": 10169, "oklahoma": 10170, "reven": 10171, "survival": 10172, "creator": 10173, "transit": 10174, "stac": 10175, "surf": 10176, "ik": 10177, "editing": 10178, "chilling": 10179, "bailey": 10180, "steal": 10181, "rable": 10182, "parent": 10183, "hunger": 10184, "snapp": 10185, "collect": 
10186, "philosoph": 10187, "dedication": 10188, "cf": 10189, "cm": 10190, "leep": 10191, "repeat": 10192, "reha": 10193, "unfortun": 10194, "aer": 10195, "aero": 10196, "abstract": 10197, "monitor": 10198, "agents": 10199, "bul": 10200, "science": 10201, "harbor": 10202, "dragons": 10203, "flooding": 10204, "accompli": 10205, "dash": 10206, "julia": 10207, "thered": 10208, "tuesday": 10209, "cyber": 10210, "blow": 10211, "tained": 10212, "lem": 10213, "reference": 10214, "ppo": 10215, "negoti": 10216, "charle": 10217, "connor": 10218, "ault": 10219, "accessories": 10220, "commissioner": 10221, "rainy": 10222, "rear": 10223, "advisory": 10224, "lucas": 10225, "maid": 10226, "coal": 10227, "kav": 10228, "polo": 10229, "ðŁı¾": 10230, "transport": 10231, "margare": 10232, "strawberry": 10233, "burns": 10234, "greens": 10235, "nev": 10236, "participants": 10237, "colin": 10238, "belgium": 10239, "colour": 10240, "inform": 10241, "dell": 10242, "bron": 10243, "caly": 10244, "kickoff": 10245, "strategic": 10246, "reunion": 10247, "honors": 10248, "lib": 10249, "egyp": 10250, "âŃIJï¸ı": 10251, "hypo": 10252, "sizes": 10253, "registered": 10254, "betes": 10255, "relaxing": 10256, "bloom": 10257, "intense": 10258, "valentines": 10259, "insane": 10260, "wwii": 10261, "px": 10262, "trio": 10263, "blade": 10264, "wisconsin": 10265, "cone": 10266, "platin": 10267, "alize": 10268, "raven": 10269, "increasing": 10270, "indians": 10271, "ilian": 10272, "blu": 10273, "rabbit": 10274, "extension": 10275, "jef": 10276, "audi": 10277, "ferry": 10278, "sell": 10279, "aday": 10280, "usb": 10281, "sweat": 10282, "champag": 10283, "method": 10284, "memph": 10285, "assist": 10286, "sby": 10287, "cape": 10288, "removed": 10289, "magn": 10290, "vt": 10291, "rams": 10292, "fbi": 10293, "tackle": 10294, "phew": 10295, "hon": 10296, "motorcycle": 10297, "suspec": 10298, "elephant": 10299, "subject": 10300, "lette": 10301, "dairy": 10302, "wheat": 10303, "awkward": 10304, "act": 10305, "trol": 10306, "mitted": 10307, "zayn": 10308, "sheriff": 10309, "enemy": 10310, "cons": 10311, "kett": 10312, "bulls": 10313, "evalu": 10314, "btc": 10315, "satellite": 10316, "holo": 10317, "porter": 10318, "diabetes": 10319, "better": 10320, "releasing": 10321, "surf": 10322, ":-": 10323, "sebasti": 10324, "collecting": 10325, "encing": 10326, "ethi": 10327, "gods": 10328, "alley": 10329, "healthy": 10330, "mills": 10331, "smash": 10332, "copper": 10333, "crack": 10334, "readers": 10335, "spac": 10336, "license": 10337, "basket": 10338, "bangla": 10339, "entic": 10340, "omi": 10341, "mere": 10342, "sively": 10343, "animation": 10344, "lanes": 10345, "dentally": 10346, "chillin": 10347, "fie": 10348, "karen": 10349, "depth": 10350, "lipse": 10351, "ng": 10352, "rip": 10353, "melo": 10354, "sandy": 10355, "ðŁijıðŁijı": 10356, "vincent": 10357, "nut": 10358, "hug": 10359, "whole": 10360, "creates": 10361, "????": 10362, "âĿ¤ï¸ıâĿ¤ï¸ı": 10363, "baked": 10364, "upgrade": 10365, "roberts": 10366, "hara": 10367, "caribbean": 10368, "authentic": 10369, "mbs": 10370, "moscow": 10371, "attorney": 10372, "wiki": 10373, "chlo": 10374, "hull": 10375, "cork": 10376, "\"!": 10377, "stylish": 10378, "ðŁĵ¸:": 10379, "diary": 10380, "improving": 10381, "expand": 10382, "bright": 10383, "pollution": 10384, "knights": 10385, "personality": 10386, "checked": 10387, "facilities": 10388, "zel": 10389, "bowling": 10390, "guer": 10391, "ðŁİĤ": 10392, "ongoing": 10393, "units": 10394, "hook": 10395, "beck": 10396, "conflict": 10397, "todd": 10398, "farming": 
10399, "educational": 10400, "kak": 10401, "clay": 10402, "stroke": 10403, "belly": 10404, "explore": 10405, "millenni": 10406, "thm": 10407, "loop": 10408, "sms": 10409, "consist": 10410, "circa": 10411, "bryan": 10412, "dab": 10413, "younger": 10414, "solidar": 10415, "ppa": 10416, "experienced": 10417, "bella": 10418, "board": 10419, "sheffield": 10420, "stephen": 10421, "consumer": 10422, "submit": 10423, "sponsor": 10424, "tang": 10425, "aggre": 10426, "combined": 10427, "tracking": 10428, "sanders": 10429, "baz": 10430, "survive": 10431, "ferred": 10432, "equal": 10433, "sep": 10434, "reed": 10435, "strong": 10436, "privacy": 10437, "stap": 10438, "ung": 10439, "acry": 10440, "pasta": 10441, "pirates": 10442, "ager": 10443, "fairy": 10444, "dup": 10445, "introduced": 10446, "wip": 10447, "lets": 10448, "spray": 10449, "ðŁĵº": 10450, "grew": 10451, "asts": 10452, "pittsburgh": 10453, "newyork": 10454, "joey": 10455, "lauren": 10456, "trade": 10457, "chop": 10458, "pipe": 10459, "claire": 10460, "behavior": 10461, "vap": 10462, "crews": 10463, "laptop": 10464, "ðŁ¤Ĺ": 10465, "chester": 10466, "discipl": 10467, "df": 10468, "outdoors": 10469, "ks": 10470, "gover": 10471, "superstar": 10472, "casino": 10473, "farmer": 10474, ";-)": 10475, "returned": 10476, "ðŁıĪ": 10477, "mail": 10478, "roasted": 10479, "costa": 10480, "vill": 10481, "pez": 10482, "gardening": 10483, "distribution": 10484, "shining": 10485, "investors": 10486, "rasp": 10487, "decades": 10488, "realized": 10489, "barn": 10490, "pti": 10491, "stable": 10492, "utd": 10493, "panthers": 10494, "mens": 10495, "bn": 10496, "cade": 10497, "bucket": 10498, "ynn": 10499, "whenever": 10500, "wake": 10501, "dais": 10502, "bernie": 10503, "lodge": 10504, "julie": 10505, "atmosphere": 10506, "ðŁĺĺðŁĺĺ": 10507, "majority": 10508, "parti": 10509, "excit": 10510, "cut": 10511, "meh": 10512, "muslims": 10513, "begun": 10514, "flights": 10515, "veness": 10516, "ceme": 10517, "posing": 10518, "sole": 10519, "gou": 10520, "darkness": 10521, "peach": 10522, "celtic": 10523, "authority": 10524, "grandma": 10525, "fulness": 10526, "smith": 10527, "specific": 10528, "garcia": 10529, "coins": 10530, "goodness": 10531, "aldub": 10532, "recruiting": 10533, "dennis": 10534, "gary": 10535, "sleeve": 10536, "weapon": 10537, "plz": 10538, "discover": 10539, "harrison": 10540, "recruitment": 10541, "jai": 10542, "chim": 10543, "compared": 10544, "toms": 10545, "mothers": 10546, "amy": 10547, "archive": 10548, "task": 10549, "benjam": 10550, "seg": 10551, "lawyer": 10552, "alum": 10553, "investing": 10554, "mie": 10555, "chez": 10556, "jp": 10557, "ake": 10558, "flam": 10559, "wallpaper": 10560, "âĻ¥ï¸ı": 10561, "tton": 10562, "chest": 10563, "favorites": 10564, "weigh": 10565, "coolest": 10566, "rating": 10567, "relevant": 10568, "logan": 10569, "maple": 10570, "runners": 10571, "prior": 10572, "people": 10573, "maur": 10574, "terrorist": 10575, "tested": 10576, "carnival": 10577, "suspen": 10578, "measure": 10579, "mv": 10580, "cybersecurity": 10581, "appren": 10582, "terrorism": 10583, "oz": 10584, "vital": 10585, "nies": 10586, "gonz": 10587, "funded": 10588, "twist": 10589, "assessment": 10590, "diesel": 10591, "enfor": 10592, "column": 10593, "addressing": 10594, "casts": 10595, "payment": 10596, "xton": 10597, "fier": 10598, ",'": 10599, "last": 10600, "nee": 10601, "unless": 10602, "close": 10603, "skill": 10604, "cuisine": 10605, "funeral": 10606, "tiles": 10607, "aun": 10608, "kru": 10609, "relationships": 10610, "ðŁĴ¯": 10611, "event": 10612, 
"âĢįâĻĤï¸ı": 10613, "kindness": 10614, "proposed": 10615, "acoustic": 10616, "aes": 10617, "defender": 10618, "dance": 10619, "htt": 10620, "wat": 10621, "voy": 10622, "ðŁ¤ĺ": 10623, "aus": 10624, "cliff": 10625, "searching": 10626, "beautifully": 10627, "inqu": 10628, "atl": 10629, "specialist": 10630, "ðŁIJ¶": 10631, "dai": 10632, "trails": 10633, "classics": 10634, "instant": 10635, "vous": 10636, "revenue": 10637, "march": 10638, "kirk": 10639, "fringe": 10640, "fireworks": 10641, "trivia": 10642, "âĺħ": 10643, "traction": 10644, "walter": 10645, "moto": 10646, "lily": 10647, "attitude": 10648, "climb": 10649, "scan": 10650, "savings": 10651, "cw": 10652, "faith": 10653, "credits": 10654, "abled": 10655, "graff": 10656, "autograph": 10657, "hehe": 10658, "ranch": 10659, "had": 10660, "rogers": 10661, "ðŁĮ¹": 10662, "fin": 10663, "requ": 10664, "folk": 10665, "additional": 10666, "lynn": 10667, "uber": 10668, "dollars": 10669, "logic": 10670, "worth": 10671, "som": 10672, "thesis": 10673, "pound": 10674, "bic": 10675, "stur": 10676, "ceram": 10677, "spencer": 10678, "entered": 10679, "vamp": 10680, "organized": 10681, "âľĪ": 10682, "pps": 10683, "tron": 10684, "mercedes": 10685, "noti": 10686, "competitive": 10687, "dow": 10688, "ousness": 10689, "victor": 10690, "grilled": 10691, "nai": 10692, "putin": 10693, "abra": 10694, "blame": 10695, "alexand": 10696, "animal": 10697, "decent": 10698, "pent": 10699, "interior": 10700, ":')": 10701, "butler": 10702, "ballet": 10703, "ðŁĴĶ": 10704, "albums": 10705, "downs": 10706, "lad": 10707, "sir": 10708, "plain": 10709, "pers": 10710, "blonde": 10711, "disc": 10712, "pakistan": 10713, "sement": 10714, "gaa": 10715, "wage": 10716, "chas": 10717, "mani": 10718, "cops": 10719, "territ": 10720, "lol": 10721, "laughter": 10722, "rivers": 10723, "magnificent": 10724, "lamp": 10725, "wb": 10726, "newsle": 10727, "charts": 10728, "blessing": 10729, "punch": 10730, "longest": 10731, "floral": 10732, "cutie": 10733, "farewell": 10734, "stopping": 10735, "mbb": 10736, "bud": 10737, "cheese": 10738, "decla": 10739, "sim": 10740, "mcdonald": 10741, "deter": 10742, "youth": 10743, "tch": 10744, "freder": 10745, "kindle": 10746, "fern": 10747, "ator": 10748, "asleep": 10749, "pond": 10750, "sprint": 10751, "pounds": 10752, "lazy": 10753, "ghe": 10754, "fundraising": 10755, "deadly": 10756, "grande": 10757, "doug": 10758, "hey": 10759, "linda": 10760, "considering": 10761, "ium": 10762, "golden": 10763, "vik": 10764, "authors": 10765, "diss": 10766, "ually": 10767, "appropriate": 10768, "morning": 10769, "yle": 10770, "honoring": 10771, "folio": 10772, "bec": 10773, "rebec": 10774, "finland": 10775, "formula": 10776, "cornwall": 10777, "shay": 10778, "causing": 10779, "blend": 10780, "signal": 10781, "tent": 10782, "kashmir": 10783, "nationals": 10784, "harmony": 10785, "scout": 10786, "accessi": 10787, "height": 10788, "medieval": 10789, "improvement": 10790, "kees": 10791, "practical": 10792, "card": 10793, "depar": 10794, "hun": 10795, "oming": 10796, "calgary": 10797, "stel": 10798, "bubble": 10799, "guru": 10800, "mah": 10801, "unexpe": 10802, "nh": 10803, "eda": 10804, "meat": 10805, "ige": 10806, "sio": 10807, "goddess": 10808, "inches": 10809, "tunes": 10810, "britt": 10811, "stion": 10812, "raj": 10813, "âĻ«": 10814, "mercy": 10815, "ðŁĴĺ": 10816, "sends": 10817, "iest": 10818, "polici": 10819, "vale": 10820, "reduced": 10821, "asap": 10822, "vijay": 10823, "defensive": 10824, "celebrations": 10825, "riders": 10826, "meditation": 10827, "harmon": 
10828, "ging": 10829, "¡": 10830, "programming": 10831, "inau": 10832, "sudden": 10833, "mh": 10834, "replacement": 10835, "sku": 10836, "jar": 10837, "grades": 10838, "tast": 10839, "kitt": 10840, "branding": 10841, "kaw": 10842, "boot": 10843, "fought": 10844, "pays": 10845, "gf": 10846, "ization": 10847, "hop": 10848, "kk": 10849, "activist": 10850, "vend": 10851, "coastal": 10852, "chaos": 10853, "ðŁĶ´": 10854, "seme": 10855, "billboard": 10856, "lifting": 10857, "cumb": 10858, "scal": 10859, "ðŁĸ¤": 10860, "struck": 10861, "lv": 10862, "indiedev": 10863, "beaten": 10864, "jungle": 10865, "alright": 10866, "destiny": 10867, "ming": 10868, "kc": 10869, "chances": 10870, "oman": 10871, "qatar": 10872, "craf": 10873, "trained": 10874, "prix": 10875, "charm": 10876, "otive": 10877, "smu": 10878, "ec": 10879, "anders": 10880, "handed": 10881, "alban": 10882, "certainly": 10883, "arriving": 10884, "ize": 10885, "sai": 10886, "track": 10887, "painter": 10888, "humble": 10889, "appointment": 10890, "headline": 10891, "managing": 10892, "mod": 10893, "aspe": 10894, "andrea": 10895, "ä": 10896, "ethiop": 10897, "united": 10898, "exist": 10899, "bali": 10900, "kad": 10901, "nt": 10902, "dred": 10903, "rex": 10904, "recognize": 10905, "tampa": 10906, "beers": 10907, "atia": 10908, "heels": 10909, "note": 10910, "transportation": 10911, "turtle": 10912, "rede": 10913, "hiphop": 10914, "spicy": 10915, "spurs": 10916, "â¬ĩ": 10917, "corp": 10918, "thern": 10919, "toast": 10920, "hurry": 10921, "properties": 10922, "mage": 10923, "marco": 10924, "elements": 10925, "bouti": 10926, "syndrome": 10927, "msg": 10928, "developer": 10929, "graders": 10930, "heim": 10931, "resil": 10932, "offices": 10933, "delay": 10934, "dimen": 10935, "vintag": 10936, "barbara": 10937, "ðŁĺ±": 10938, "venezu": 10939, "cular": 10940, "faced": 10941, "barn": 10942, "ðŁĺĨ": 10943, "survivor": 10944, "worm": 10945, "confused": 10946, "passionate": 10947, "ر": 10948, "identify": 10949, "electricity": 10950, "souls": 10951, "bradley": 10952, "reportedly": 10953, "lunch": 10954, "shelf": 10955, "elia": 10956, "sweet": 10957, "smooth": 10958, "employment": 10959, "amel": 10960, "manhattan": 10961, "steam": 10962, "ounts": 10963, "yep": 10964, "living": 10965, "une": 10966, "describe": 10967, "cares": 10968, "manila": 10969, "shawn": 10970, "acted": 10971, "bash": 10972, "steven": 10973, "rest": 10974, "petition": 10975, "divine": 10976, "welsh": 10977, "race": 10978, "platinum": 10979, "ðŁĮ¸": 10980, "pb": 10981, "extraordinary": 10982, "solidarity": 10983, "mall": 10984, "onion": 10985, "scheduled": 10986, "gameof": 10987, "fergu": 10988, "dems": 10989, "norm": 10990, "pk": 10991, "trials": 10992, "policies": 10993, "publishing": 10994, "stole": 10995, "front": 10996, "character": 10997, "vania": 10998, "exce": 10999, "stie": 11000, "sca": 11001, "residential": 11002, "sailing": 11003, "ðŁĶ¥ðŁĶ¥ðŁĶ¥": 11004, "sponsors": 11005, "thick": 11006, "champagne": 11007, "shepher": 11008, "continuing": 11009, "venice": 11010, "perth": 11011, "nap": 11012, "aster": 11013, "yak": 11014, "unlimited": 11015, "choices": 11016, "neo": 11017, "hiv": 11018, "reporter": 11019, "brussels": 11020, "fold": 11021, "dys": 11022, "semi": 11023, "lawn": 11024, "italia": 11025, "wifi": 11026, "ask": 11027, "emed": 11028, "frame": 11029, "monitoring": 11030, "stead": 11031, "ida": 11032, "grin": 11033, "isa": 11034, "flip": 11035, "restric": 11036, "offensive": 11037, "attached": 11038, "dish": 11039, "why": 11040, "phillips": 11041, "greet": 11042, "pals": 
11043, "mixtape": 11044, "vou": 11045, "fielder": 11046, "spark": 11047, "alberta": 11048, "glen": 11049, "cash": 11050, "sri": 11051, "uri": 11052, "rodri": 11053, "entrepreneurs": 11054, "climatechange": 11055, "psy": 11056, "dle": 11057, "ements": 11058, "linked": 11059, "netherlands": 11060, "accidentally": 11061, "opposition": 11062, "velvet": 11063, "rays": 11064, "cw": 11065, "omo": 11066, "mf": 11067, "lmfao": 11068, "newsletter": 11069, ":)": 11070, "toilet": 11071, "literature": 11072, "disp": 11073, "philip": 11074, "uniform": 11075, "suddenly": 11076, "header": 11077, "cooler": 11078, "---": 11079, "proud": 11080, "brig": 11081, "nissan": 11082, "scientist": 11083, "jah": 11084, "concentr": 11085, "packs": 11086, "appointed": 11087, "soap": 11088, "engage": 11089, "chose": 11090, "âĻ¡": 11091, "setup": 11092, "jealous": 11093, "harry": 11094, "gation": 11095, "tunnel": 11096, "temp": 11097, "oscars": 11098, "decade": 11099, "recommended": 11100, "children": 11101, "aba": 11102, "anxiety": 11103, "vements": 11104, "salon": 11105, "photoo": 11106, "organiz": 11107, "machines": 11108, "abs": 11109, "ville": 11110, "hype": 11111, "tiff": 11112, "emerging": 11113, "avgeek": 11114, "[#": 11115, "contribution": 11116, "brady": 11117, "resto": 11118, "gmail": 11119, "fitz": 11120, "photoshoot": 11121, "helmet": 11122, "ht": 11123, "elegant": 11124, "uganda": 11125, "nursing": 11126, "orleans": 11127, "penn": 11128, "nah": 11129, "footage": 11130, "ema": 11131, "wo": 11132, "wad": 11133, "concerns": 11134, "vere": 11135, "remark": 11136, "whoever": 11137, "strang": 11138, "pt": 11139, "quit": 11140, "shang": 11141, "history": 11142, "sick": 11143, "permanent": 11144, "illness": 11145, "cold": 11146, "vision": 11147, "hem": 11148, "arrow": 11149, "convic": 11150, "pink": 11151, "occup": 11152, "bald": 11153, "exhau": 11154, "uof": 11155, "amo": 11156, "ont": 11157, "ãĥ»": 11158, "adopt": 11159, "laid": 11160, "smoked": 11161, "interpre": 11162, "essenti": 11163, "associated": 11164, "bd": 11165, "bby": 11166, "fier": 11167, "install": 11168, "diplom": 11169, "conditi": 11170, "cf": 11171, "wak": 11172, "anya": 11173, "graci": 11174, "fisher": 11175, "sss": 11176, "apr": 11177, "ilit": 11178, "musician": 11179, "symphony": 11180, "cord": 11181, "hack": 11182, "legi": 11183, "lv": 11184, "blessings": 11185, "humor": 11186, "scra": 11187, "eti": 11188, "minster": 11189, "travelling": 11190, "bush": 11191, "jewellery": 11192, "lime": 11193, "!!!": 11194, "pregnant": 11195, "pee": 11196, "lob": 11197, "capital": 11198, "ipa": 11199, "pencil": 11200, "labor": 11201, "ducks": 11202, "proudly": 11203, "wedding": 11204, "derek": 11205, "mw": 11206, "peg": 11207, "valentine": 11208, "angu": 11209, "retreat": 11210, "prospect": 11211, "danger": 11212, "vulner": 11213, "upset": 11214, ",#": 11215, "srk": 11216, "xim": 11217, "thursday": 11218, "nfl": 11219, "kisses": 11220, "reds": 11221, "crack": 11222, "reward": 11223, "cu": 11224, "kok": 11225, "mete": 11226, "abandoned": 11227, "itt": 11228, "meals": 11229, "spell": 11230, "stanbul": 11231, "delays": 11232, "rum": 11233, "leop": 11234, "gum": 11235, "nova": 11236, "superman": 11237, "chick": 11238, "mis": 11239, "dramatic": 11240, "innocent": 11241, "rounds": 11242, "rec": 11243, "autism": 11244, "bangladesh": 11245, "moral": 11246, "movie": 11247, "spoo": 11248, "kla": 11249, "âĥ£": 11250, "outing": 11251, "messi": 11252, "abroad": 11253, "lookin": 11254, "aim": 11255, "qi": 11256, "stack": 11257, "collage": 11258, "à¯": 11259, "hudson": 11260, 
"scan": 11261, "hoe": 11262, "chau": 11263, "occur": 11264, "commander": 11265, "holes": 11266, "ðŁİĦ": 11267, "bias": 11268, "von": 11269, "sticker": 11270, "mak": 11271, "responsibility": 11272, "columbus": 11273, "saint": 11274, "edmon": 11275, "racism": 11276, "farms": 11277, "wen": 11278, "gulf": 11279, "mayo": 11280, "!!!!!!!!": 11281, "corporation": 11282, "bachel": 11283, "ela": 11284, "internal": 11285, "jeep": 11286, "follows": 11287, "dialogue": 11288, "derer": 11289, "smartphone": 11290, "helen": 11291, "richmond": 11292, "equity": 11293, "sland": 11294, "bg": 11295, "near": 11296, "avi": 11297, "memphis": 11298, "weir": 11299, "discussed": 11300, "badge": 11301, "pup": 11302, "mistake": 11303, "phenomen": 11304, "unite": 11305, "ðŁĽ": 11306, "depic": 11307, "rides": 11308, "inaugu": 11309, "nat": 11310, "softwitter": 11311, "combination": 11312, "gospel": 11313, "âļ¾": 11314, "admission": 11315, "retrogaming": 11316, "ðŁIJ¾": 11317, "schu": 11318, "mbo": 11319, "junction": 11320, "alarm": 11321, "à¦": 11322, "grac": 11323, "khali": 11324, "kul": 11325, "male": 11326, "caption": 11327, "wish": 11328, "tere": 11329, "corps": 11330, "rubber": 11331, "playstation": 11332, "erin": 11333, "efficient": 11334, "lor": 11335, "jokes": 11336, "inary": 11337, "norman": 11338, "luis": 11339, "inaugural": 11340, "ched": 11341, "âļ½ï¸ı": 11342, "dip": 11343, "toe": 11344, "strat": 11345, "aac": 11346, "amu": 11347, "pier": 11348, "cott": 11349, "command": 11350, "tten": 11351, "snoo": 11352, "cube": 11353, "closes": 11354, "classical": 11355, "sword": 11356, "expression": 11357, "reaching": 11358, "napp": 11359, "cost": 11360, "affect": 11361, "rico": 11362, "gif": 11363, "breathe": 11364, "tribe": 11365, "ortho": 11366, "hay": 11367, "lg": 11368, "fries": 11369, "nm": 11370, "hiding": 11371, "richards": 11372, "ende": 11373, "micro": 11374, "capitol": 11375, "copy": 11376, "rom": 11377, "regime": 11378, "maryland": 11379, "taxi": 11380, "dial": 11381, "embarra": 11382, "unbeliev": 11383, "cht": 11384, "vs": 11385, "elimin": 11386, "odd": 11387, "penny": 11388, "soundtrack": 11389, "lings": 11390, "transition": 11391, "remaining": 11392, "ais": 11393, "malik": 11394, "?!?": 11395, "random": 11396, "defend": 11397, "ultra": 11398, "trum": 11399, "dancer": 11400, "stol": 11401, "drive": 11402, "aver": 11403, "roast": 11404, "definition": 11405, "sean": 11406, "excitement": 11407, "particul": 11408, "surely": 11409, "shav": 11410, "bery": 11411, "dishes": 11412, "comm": 11413, "isol": 11414, "iam": 11415, "obli": 11416, "ghost": 11417, "hughes": 11418, "chiefs": 11419, "bas": 11420, "conservative": 11421, "special": 11422, "femin": 11423, "shri": 11424, "nancy": 11425, "intel": 11426, "tune": 11427, "ðŁĩª": 11428, "joel": 11429, "ggle": 11430, "moto": 11431, "ðŁĺĶ": 11432, "buck": 11433, "dag": 11434, "anticip": 11435, "montana": 11436, "guid": 11437, "frog": 11438, "ecraft": 11439, "ope": 11440, "drives": 11441, "numer": 11442, "xy": 11443, "colorful": 11444, "wednesdaywisdom": 11445, "illumin": 11446, "beyon": 11447, "inaugur": 11448, "deeply": 11449, "prefer": 11450, "fortune": 11451, "cooked": 11452, "tible": 11453, "âĺķ": 11454, "sweater": 11455, "itter": 11456, "tty": 11457, "ui": 11458, "gie": 11459, "complic": 11460, "~~": 11461, "taxes": 11462, "cups": 11463, "diverse": 11464, "samanth": 11465, "âłĢâłĢ": 11466, "baking": 11467, "symp": 11468, "wai": 11469, "behalf": 11470, "mercur": 11471, "travels": 11472, "ðŁİīðŁİ": 11473, "oria": 11474, "engaged": 11475, "jumping": 11476, "retired": 
11477, "naked": 11478, "puni": 11479, "speedway": 11480, "sciences": 11481, "rehearsal": 11482, "onym": 11483, "dyou": 11484, "plates": 11485, "rati": 11486, "krish": 11487, "jazz": 11488, "carol": 11489, "raf": 11490, "penalty": 11491, "timeline": 11492, "ruby": 11493, "engineers": 11494, "raf": 11495, "belle": 11496, "dose": 11497, "cheon": 11498, "escap": 11499, "meg": 11500, "rank": 11501, "ord": 11502, "megan": 11503, "merch": 11504, "eclipse": 11505, "âĺºï¸ı": 11506, "pledge": 11507, "kirk": 11508, "persi": 11509, "leicester": 11510, "sak": 11511, "wk": 11512, "safely": 11513, "yyy": 11514, "jet": 11515, "promised": 11516, "jc": 11517, "enne": 11518, "noah": 11519, "reno": 11520, "rea": 11521, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 11522, "trail": 11523, "ðŁijĢ": 11524, "fd": 11525, "sooo": 11526, "rimin": 11527, "wk": 11528, "า": 11529, "ial": 11530, "xox": 11531, "biscu": 11532, "dale": 11533, "fandom": 11534, "participating": 11535, "flag": 11536, "privilege": 11537, "peach": 11538, "machine": 11539, "boston": 11540, "gross": 11541, "og": 11542, "miracle": 11543, "adoption": 11544, "uss": 11545, "monsters": 11546, "beij": 11547, "clarke": 11548, "pushing": 11549, "praying": 11550, "aro": 11551, "dn": 11552, "ellis": 11553, "apollo": 11554, "odds": 11555, "refugee": 11556, "tow": 11557, "bp": 11558, "ðŁĩ¬ðŁĩ§": 11559, "hend": 11560, "appeared": 11561, "membership": 11562, "pean": 11563, "dum": 11564, "violent": 11565, "vy": 11566, "potatoes": 11567, "aww": 11568, "greetings": 11569, "tts": 11570, "acon": 11571, "shane": 11572, "photographed": 11573, "crab": 11574, "temperatures": 11575, "cuba": 11576, "cfc": 11577, "welcom": 11578, "hel": 11579, "innings": 11580, "mk": 11581, "code": 11582, "knock": 11583, "grass": 11584, "swedish": 11585, "pta": 11586, "icky": 11587, "vat": 11588, "lining": 11589, "sq": 11590, "sap": 11591, "arc": 11592, "announcing": 11593, "skins": 11594, "cityof": 11595, "bring": 11596, "cox": 11597, "gamer": 11598, "itarian": 11599, "ida": 11600, "hd": 11601, "rosse": 11602, "sadly": 11603, "geo": 11604, "âļ¡ï¸ı": 11605, "tags": 11606, "father": 11607, "change": 11608, "lance": 11609, "whiskey": 11610, "adelaide": 11611, "tec": 11612, "stickers": 11613, "market": 11614, "classy": 11615, "badass": 11616, "florence": 11617, "liner": 11618, "frost": 11619, "kate": 11620, "acon": 11621, "scandal": 11622, "essex": 11623, "ðŁĺı": 11624, "vivi": 11625, "drill": 11626, "bloggers": 11627, "recommend": 11628, "dha": 11629, "acres": 11630, "roma": 11631, "buy": 11632, "grocer": 11633, "eria": 11634, "mahar": 11635, "ffer": 11636, "patterns": 11637, "veri": 11638, "compu": 11639, "stev": 11640, "anga": 11641, "mentor": 11642, "doo": 11643, "itali": 11644, "cdnpoli": 11645, "only": 11646, "conduct": 11647, "electro": 11648, "def": 11649, "whale": 11650, "preparation": 11651, "bicycle": 11652, "viral": 11653, "turnout": 11654, "brass": 11655, "quad": 11656, "hospitality": 11657, "packaging": 11658, "dency": 11659, "cemetery": 11660, "aboard": 11661, "dreaming": 11662, "picture": 11663, "tall": 11664, "invent": 11665, "admi": 11666, "oe": 11667, "temps": 11668, "quan": 11669, "fundam": 11670, "promp": 11671, "residence": 11672, "mud": 11673, "souri": 11674, "âĦ¢": 11675, "graffiti": 11676, "gif": 11677, "dnd": 11678, "comp": 11679, "swar": 11680, "peeps": 11681, "palestine": 11682, "devils": 11683, "sang": 11684, "assistance": 11685, "bike": 11686, "mississi": 11687, "interviewed": 11688, "nephew": 11689, "drums": 11690, "vand": 11691, "gentlemen": 11692, "nsw": 11693, "insta": 11694, 
"lebanon": 11695, "eeee": 11696, "olivia": 11697, "very": 11698, "rough": 11699, "industries": 11700, "mation": 11701, "ðŁĺĴ": 11702, "barrel": 11703, "nay": 11704, "pops": 11705, "modern": 11706, "illy": 11707, "arest": 11708, "onents": 11709, "protecting": 11710, "vans": 11711, "eo": 11712, "vikings": 11713, "restaurants": 11714, "reck": 11715, "jackie": 11716, "andrew": 11717, "willing": 11718, "heath": 11719, "citizen": 11720, "discrimin": 11721, "à¹Ī": 11722, "stuart": 11723, "mys": 11724, "hip": 11725, "transp": 11726, "\"?": 11727, "tex": 11728, "sushi": 11729, "ked": 11730, "crossed": 11731, "distur": 11732, "pedia": 11733, "fate": 11734, "somehow": 11735, "moth": 11736, "processing": 11737, "iss": 11738, "rin": 11739, "uts": 11740, "yyc": 11741, "vert": 11742, "lgbt": 11743, "reid": 11744, "onto": 11745, "arabia": 11746, "habitat": 11747, "==": 11748, "streak": 11749, "simpson": 11750, "addiction": 11751, "wimble": 11752, "delivers": 11753, "challenging": 11754, "ðŁİ¶": 11755, "franch": 11756, "edu": 11757, "sme": 11758, "aids": 11759, "hurst": 11760, "tham": 11761, "tarian": 11762, "remembered": 11763, "palestinian": 11764, "fees": 11765, "trum": 11766, "sketch": 11767, "uru": 11768, "fitting": 11769, "jesse": 11770, "ðŁĶ¥ðŁĶ¥": 11771, "--------": 11772, "bach": 11773, "icia": 11774, "colored": 11775, "dah": 11776, "associate": 11777, "intel": 11778, "seller": 11779, "pu": 11780, "stuffed": 11781, "acs": 11782, "bs": 11783, "shin": 11784, "cooperation": 11785, "certificate": 11786, "abu": 11787, "ingredients": 11788, "rev": 11789, "inge": 11790, "elder": 11791, "christian": 11792, "bundle": 11793, "thic": 11794, "dirt": 11795, "beijing": 11796, "commit": 11797, "teddy": 11798, "edu": 11799, "today": 11800, "sfield": 11801, "wyn": 11802, "confirms": 11803, "loo": 11804, "jv": 11805, "eness": 11806, "alpha": 11807, "virus": 11808, "arium": 11809, "grind": 11810, "bridges": 11811, "introduction": 11812, "polls": 11813, "bacter": 11814, "zach": 11815, "terminal": 11816, "raiders": 11817, "flavor": 11818, "zombie": 11819, "vod": 11820, "spreading": 11821, "gameofthrones": 11822, "efficiency": 11823, "lately": 11824, "alem": 11825, "tweet": 11826, "crimes": 11827, "cler": 11828, "dey": 11829, "dged": 11830, "hyun": 11831, "payments": 11832, "circus": 11833, "ðŁĺŃðŁĺŃ": 11834, "missouri": 11835, "lub": 11836, "episodes": 11837, "cage": 11838, "pos": 11839, "matching": 11840, "tumblr": 11841, "lined": 11842, "gest": 11843, "ambi": 11844, "narr": 11845, "ington": 11846, "regul": 11847, "blown": 11848, "isle": 11849, "coco": 11850, "ondon": 11851, "joshua": 11852, "touring": 11853, "sma": 11854, "sausage": 11855, "bestfriend": 11856, "boeing": 11857, "desire": 11858, "savage": 11859, "rapper": 11860, "devo": 11861, "tear": 11862, "takeover": 11863, "cowboys": 11864, "poker": 11865, "parag": 11866, "ppe": 11867, "hint": 11868, "wears": 11869, "seth": 11870, "roles": 11871, "lanc": 11872, "manga": 11873, "format": 11874, "flyer": 11875, "cay": 11876, "moor": 11877, "bake": 11878, "splash": 11879, "vad": 11880, "kerala": 11881, "proceeds": 11882, "silly": 11883, "reflection": 11884, "distr": 11885, "wid": 11886, "suit": 11887, "civic": 11888, "yankees": 11889, "byn": 11890, "migration": 11891, "distin": 11892, "orch": 11893, "femini": 11894, "qualifying": 11895, "turi": 11896, "obe": 11897, "hundred": 11898, "crap": 11899, "wang": 11900, "mathemat": 11901, "bure": 11902, "exposure": 11903, "ferguson": 11904, "semester": 11905, "reserv": 11906, "plym": 11907, "ahu": 11908, "facial": 11909, 
"wax": 11910, "worried": 11911, "cab": 11912, "vio": 11913, "asa": 11914, "cod": 11915, "topics": 11916, "pcs": 11917, "halo": 11918, "rescued": 11919, "horizon": 11920, "ark": 11921, "âļª": 11922, "holly": 11923, "elf": 11924, "ulti": 11925, "pup": 11926, "qualified": 11927, "attendance": 11928, "atively": 11929, "destroy": 11930, "yc": 11931, "forth": 11932, "photooftheday": 11933, "cents": 11934, "iceland": 11935, "measures": 11936, "desk": 11937, "portfolio": 11938, "articles": 11939, "directors": 11940, "datab": 11941, "ew": 11942, "creepy": 11943, "ounding": 11944, "honoured": 11945, "mist": 11946, "jit": 11947, "mentioned": 11948, "portable": 11949, "itic": 11950, "dann": 11951, "fridayfeeling": 11952, "amid": 11953, "tiger": 11954, "scrip": 11955, "helicopter": 11956, "hardware": 11957, "explor": 11958, "workplace": 11959, "austria": 11960, "beatles": 11961, "bernar": 11962, "spider": 11963, "disco": 11964, "cult": 11965, "limits": 11966, "shortly": 11967, "final": 11968, "ninja": 11969, "luke": 11970, "lebron": 11971, "walmart": 11972, "oil": 11973, "vanilla": 11974, "shire": 11975, "yeg": 11976, "aky": 11977, "cs": 11978, "bler": 11979, "collected": 11980, "tg": 11981, "rolled": 11982, "specials": 11983, "bff": 11984, "pierre": 11985, "shim": 11986, "vier": 11987, "flashback": 11988, "restoration": 11989, "individuals": 11990, "prod": 11991, "freaking": 11992, "turer": 11993, "oa": 11994, "refre": 11995, "moroc": 11996, "greet": 11997, "reyn": 11998, "careful": 11999, "ouring": 12000, "ush": 12001, "isd": 12002, "gill": 12003, "view": 12004, "thunderstorm": 12005, "bled": 12006, "picnic": 12007, "guardi": 12008, "pig": 12009, "ark": 12010, "sylvania": 12011, "banned": 12012, "ucl": 12013, "vijay": 12014, "orium": 12015, "avengers": 12016, "believes": 12017, "eur": 12018, "monument": 12019, "concerned": 12020, "labs": 12021, "berg": 12022, "aap": 12023, "vish": 12024, "singles": 12025, "cancel": 12026, "zel": 12027, "arab": 12028, "ruth": 12029, "tooth": 12030, "arta": 12031, "shaf": 12032, "chairs": 12033, "rack": 12034, "diseases": 12035, "crowd": 12036, "cly": 12037, "flex": 12038, "christma": 12039, "artificial": 12040, "tomat": 12041, "fine": 12042, "draws": 12043, "advocate": 12044, "france": 12045, "ÙĬ": 12046, "ðŁĺ³": 12047, "heavy": 12048, "sour": 12049, "comprehen": 12050, "noble": 12051, "aap": 12052, "hindu": 12053, "coral": 12054, "gars": 12055, "owen": 12056, "nl": 12057, "stall": 12058, "yellow": 12059, "marina": 12060, "inver": 12061, "support": 12062, "tough": 12063, "promises": 12064, "pie": 12065, "masterpiece": 12066, "score": 12067, "force": 12068, "mortg": 12069, "cryptocurrency": 12070, "ox": 12071, "rors": 12072, "rockin": 12073, "provin": 12074, "hog": 12075, "nostal": 12076, "oakland": 12077, "patrick": 12078, "inclusion": 12079, "traffic": 12080, "ahmed": 12081, "aha": 12082, "luxury": 12083, "consecu": 12084, "demon": 12085, "âĸº": 12086, "blowing": 12087, "stag": 12088, ":\"": 12089, "encourage": 12090, "bene": 12091, "skull": 12092, "dodge": 12093, "buster": 12094, "kinson": 12095, "witne": 12096, "error": 12097, "lowest": 12098, "fellow": 12099, "à°": 12100, "shre": 12101, "blur": 12102, "virgin": 12103, "composer": 12104, "slip": 12105, "mornings": 12106, "gains": 12107, "table": 12108, "grain": 12109, "arist": 12110, "brazilian": 12111, "wwe": 12112, "tues": 12113, "ribbon": 12114, "anag": 12115, "dist": 12116, "sacrif": 12117, "embrace": 12118, "entrepreneur": 12119, "affili": 12120, "deo": 12121, "tali": 12122, "tourist": 12123, "fatal": 12124, 
"ìĬ": 12125, "automatic": 12126, "ðŁĩµ": 12127, "weak": 12128, "welfare": 12129, "confirm": 12130, "benjamin": 12131, "fights": 12132, "alleged": 12133, "mead": 12134, "struggling": 12135, "prosecu": 12136, "chef": 12137, "è": 12138, "proposal": 12139, "ern": 12140, "ðŁĺĦ": 12141, "dyk": 12142, "ongs": 12143, "hong": 12144, "mack": 12145, "melon": 12146, "onent": 12147, "rush": 12148, "dap": 12149, "toler": 12150, "propag": 12151, "cze": 12152, "translation": 12153, "wallet": 12154, "cottage": 12155, "sail": 12156, "constitution": 12157, "ðŁĴĢ": 12158, "munici": 12159, "favor": 12160, "stormhour": 12161, "ih": 12162, "ðŁĺĮ": 12163, "approaching": 12164, "pinned": 12165, "jed": 12166, "nigerian": 12167, "nach": 12168, "shat": 12169, "particularly": 12170, "mcdon": 12171, "cameras": 12172, "annie": 12173, "administr": 12174, "heat": 12175, "electrical": 12176, "charming": 12177, "gibson": 12178, "boutique": 12179, "exposed": 12180, "actor": 12181, "pillow": 12182, "beaches": 12183, "genuine": 12184, "margaret": 12185, "bennett": 12186, "louisi": 12187, "positions": 12188, "ely": 12189, "shiny": 12190, "tention": 12191, "architect": 12192, "rental": 12193, "acqui": 12194, "google": 12195, "subway": 12196, "moment": 12197, "ðŁļ¨": 12198, "rim": 12199, "methods": 12200, "cycli": 12201, "norfolk": 12202, "ÙĪ": 12203, "overwhel": 12204, "rapid": 12205, "wear": 12206, "happybirthday": 12207, "progressive": 12208, "ðŁĴ¥": 12209, "cogn": 12210, "papa": 12211, "fool": 12212, "philosophy": 12213, "polar": 12214, "jimmy": 12215, "wig": 12216, "ðŁĴĭ": 12217, "operating": 12218, "reduction": 12219, "phi": 12220, "flags": 12221, "tothe": 12222, "odi": 12223, "ares": 12224, "koo": 12225, "kang": 12226, "arkansas": 12227, "ashton": 12228, "wimbledon": 12229, "scifi": 12230, "attractive": 12231, "mississippi": 12232, "logists": 12233, "ralph": 12234, "label": 12235, "graduates": 12236, "maha": 12237, "hometown": 12238, "âľĮï¸ı": 12239, "founded": 12240, "onthe": 12241, "liz": 12242, "transl": 12243, "minimum": 12244, "presti": 12245, "tam": 12246, "generations": 12247, "rebel": 12248, "journalists": 12249, "param": 12250, "mcm": 12251, "acrylic": 12252, "deaths": 12253, "tesla": 12254, "wt": 12255, "bryant": 12256, "jerus": 12257, "istanbul": 12258, "muhammad": 12259, "riley": 12260, "kris": 12261, "workshops": 12262, "iso": 12263, "counts": 12264, "stret": 12265, "protected": 12266, "trinity": 12267, "manual": 12268, "rhin": 12269, "ril": 12270, "pleasant": 12271, "lemon": 12272, "nerd": 12273, "harder": 12274, "darren": 12275, "bury": 12276, "rah": 12277, "basis": 12278, "migu": 12279, "occasion": 12280, "lists": 12281, "âĿ¤ï¸ıâĿ¤ï¸ıâĿ¤ï¸ı": 12282, "eb": 12283, "decre": 12284, "hampton": 12285, "ìĿ´": 12286, "travis": 12287, "transform": 12288, "puerto": 12289, "nhl": 12290, "avoc": 12291, "trips": 12292, "unexpected": 12293, "vet": 12294, "didyou": 12295, "barber": 12296, "stages": 12297, "mson": 12298, "represented": 12299, "fort": 12300, "lal": 12301, "pple": 12302, "nicely": 12303, "ignore": 12304, "quil": 12305, "quinn": 12306, "hk": 12307, "carrier": 12308, "reminded": 12309, "among": 12310, "passenger": 12311, "ellen": 12312, "guez": 12313, "scape": 12314, "mural": 12315, "youngest": 12316, "mash": 12317, "dill": 12318, "routine": 12319, "stainless": 12320, "jackson": 12321, "gandhi": 12322, "thal": 12323, "oners": 12324, "editorial": 12325, "conversations": 12326, "sdale": 12327, "automation": 12328, "ike": 12329, "าà¸": 12330, "ðŁĩª": 12331, "haul": 12332, "laying": 12333, "mentions": 12334, 
"amen": 12335, "abortion": 12336, "ibi": 12337, "counties": 12338, "catherine": 12339, "mands": 12340, "jame": 12341, "roller": 12342, "aut": 12343, "nam": 12344, "ological": 12345, "ception": 12346, "ranking": 12347, "toxic": 12348, "snacks": 12349, "victorian": 12350, "bangkok": 12351, "psychology": 12352, "reg": 12353, "angela": 12354, "respond": 12355, "style": 12356, "sophie": 12357, "dakota": 12358, "achieved": 12359, "marked": 12360, "imperial": 12361, "inas": 12362, "gloves": 12363, "slim": 12364, "confident": 12365, "attacked": 12366, "gger": 12367, "lonely": 12368, "valentinesday": 12369, "reb": 12370, "craftbeer": 12371, "origin": 12372, "zimbab": 12373, "ceiling": 12374, "teens": 12375, "otherwise": 12376, "wb": 12377, "fers": 12378, "daysof": 12379, "advisor": 12380, "yah": 12381, "âĻª": 12382, "ender": 12383, "republicans": 12384, "ava": 12385, "skirt": 12386, "pipel": 12387, "chie": 12388, "jane": 12389, "jax": 12390, "ðŁĺĭ": 12391, "âľĬ": 12392, "jays": 12393, "brett": 12394, "balo": 12395, "crucial": 12396, "dhar": 12397, "asis": 12398, "deau": 12399, "lloyd": 12400, "chatting": 12401, "âĿĦï¸ı": 12402, "relay": 12403, "remarkable": 12404, "ns": 12405, "wet": 12406, "brisbane": 12407, "ðŁĶ´": 12408, "tionally": 12409, "fk": 12410, "layer": 12411, "household": 12412, "consecutive": 12413, "esis": 12414, "pendant": 12415, "stir": 12416, "critic": 12417, "sugar": 12418, "photoshop": 12419, "pares": 12420, "artistic": 12421, "dodgers": 12422, "cun": 12423, "crafted": 12424, "amend": 12425, "boat": 12426, "âŃIJï¸ı": 12427, "egyptian": 12428, "saw": 12429, "trage": 12430, "smaller": 12431, "oxy": 12432, "paired": 12433, "next": 12434, "ires": 12435, "taco": 12436, "oy": 12437, "uc": 12438, "sti": 12439, "aerial": 12440, "://": 12441, "dro": 12442, "dotcom": 12443, "ggins": 12444, "rpg": 12445, "aye": 12446, "lean": 12447, "striker": 12448, "lobby": 12449, "protests": 12450, "priority": 12451, "congress": 12452, "amate": 12453, "invit": 12454, "rington": 12455, "mommy": 12456, "thus": 12457, "allowing": 12458, "pioneer": 12459, "enforcement": 12460, "gori": 12461, "talk": 12462, "drag": 12463, "dumb": 12464, "bullet": 12465, "sange": 12466, "ery": 12467, "targets": 12468, "ðŁĩ¦": 12469, "heather": 12470, "consider": 12471, "seafood": 12472, "vest": 12473, "risks": 12474, "%.": 12475, "pg": 12476, "sacred": 12477, "heating": 12478, "kicked": 12479, "ttot": 12480, ".-": 12481, "chandi": 12482, "coven": 12483, "pool": 12484, "pulse": 12485, "ia": 12486, "roster": 12487, "shakespeare": 12488, "esa": 12489, "cargo": 12490, "peanut": 12491, "troop": 12492, "action": 12493, "tablet": 12494, "homework": 12495, "castle": 12496, "struction": 12497, "musicians": 12498, "freezing": 12499, "butt": 12500, "justinbieber": 12501, "jj": 12502, "bahrain": 12503, "anthem": 12504, "audit": 12505, "didyouknow": 12506, "navig": 12507, "guidance": 12508, "âĸ¶": 12509, "turf": 12510, "nun": 12511, "fications": 12512, "yemen": 12513, "charging": 12514, "xc": 12515, "broncos": 12516, "subur": 12517, "pale": 12518, "boring": 12519, "amongst": 12520, "forthe": 12521, "emper": 12522, "omfg": 12523, "pj": 12524, "expecting": 12525, "ðŁĴ«": 12526, "stl": 12527, "admin": 12528, "expectations": 12529, "swan": 12530, "shoot": 12531, "ooooo": 12532, "minent": 12533, "ãĢIJ": 12534, "wallace": 12535, "stang": 12536, "saturday": 12537, "adopted": 12538, "doubles": 12539, "homie": 12540, "omez": 12541, "dhan": 12542, "venture": 12543, "surrounding": 12544, "file": 12545, "mobility": 12546, "dees": 12547, "wski": 12548, 
"brooke": 12549, "embro": 12550, "remembers": 12551, "kara": 12552, "testim": 12553, "botan": 12554, "mtv": 12555, "sacrifice": 12556, "jerusalem": 12557, "dl": 12558, "´": 12559, "properly": 12560, "ilion": 12561, "asi": 12562, "legit": 12563, "cope": 12564, "mcla": 12565, "recycling": 12566, "larger": 12567, "ðŁĴĵ": 12568, "patric": 12569, "generous": 12570, "jared": 12571, "pf": 12572, "molly": 12573, "thomas": 12574, "judges": 12575, "hb": 12576, "sorts": 12577, "blvd": 12578, "oven": 12579, "entering": 12580, "planes": 12581, "beet": 12582, "integration": 12583, "booked": 12584, "freed": 12585, "vern": 12586, "ashes": 12587, "topped": 12588, "depot": 12589, "welcomed": 12590, "rena": 12591, "mick": 12592, "dand": 12593, "seeks": 12594, "gamer": 12595, "rankings": 12596, "rene": 12597, "mut": 12598, "whisky": 12599, "firefighters": 12600, "gues": 12601, "gather": 12602, "tourney": 12603, "demen": 12604, "yang": 12605, "newton": 12606, "automotive": 12607, "backyard": 12608, "detailed": 12609, "mist": 12610, "tobac": 12611, "fiber": 12612, "unusual": 12613, "gratitude": 12614, "spare": 12615, "neys": 12616, ":*": 12617, "peri": 12618, "floating": 12619, "finalist": 12620, "donating": 12621, "dress": 12622, "broad": 12623, "bethe": 12624, "economics": 12625, "taiwan": 12626, "edwards": 12627, "plug": 12628, "prairi": 12629, "valen": 12630, "baba": 12631, "fad": 12632, "anas": 12633, "harper": 12634, "disorder": 12635, "applied": 12636, "patt": 12637, "bikin": 12638, "liver": 12639, "curi": 12640, "caroline": 12641, "anner": 12642, "julian": 12643, "walking": 12644, "malcol": 12645, "screenshot": 12646, "coding": 12647, "skincare": 12648, "activists": 12649, "mysterious": 12650, "exact": 12651, "blocking": 12652, "mercury": 12653, "batter": 12654, "dump": 12655, "âľĮ": 12656, "ense": 12657, "lish": 12658, "ridiculous": 12659, "protesters": 12660, "ðŁĻĪ": 12661, "lust": 12662, "sweat": 12663, "ass": 12664, "alike": 12665, "cody": 12666, "rements": 12667, "winds": 12668, "aspir": 12669, "vienna": 12670, "pray": 12671, "...@": 12672, "boi": 12673, "candle": 12674, "assists": 12675, "tee": 12676, "derson": 12677, "pony": 12678, "fence": 12679, "conspir": 12680, "âĺħâĺħ": 12681, "ooth": 12682, "epic": 12683, "barely": 12684, "aunt": 12685, "bam": 12686, "diamonds": 12687, "endless": 12688, "screens": 12689, "cancer": 12690, "gro": 12691, "pst": 12692, "prospec": 12693, "mosque": 12694, "helpful": 12695, "ouri": 12696, "brother": 12697, "gujar": 12698, "cristi": 12699, "inez": 12700, "towers": 12701, "addresses": 12702, "gray": 12703, "burton": 12704, "retweeted": 12705, "ðŁ¤Ķ": 12706, "nity": 12707, "duck": 12708, "supervis": 12709, "joan": 12710, "kinder": 12711, "sanctu": 12712, "pied": 12713, "âı°": 12714, "łï¸ı": 12715, "mati": 12716, "revenge": 12717, "cester": 12718, "elife": 12719, "designers": 12720, "backed": 12721, "boli": 12722, "weight": 12723, "couch": 12724, "sures": 12725, "sits": 12726, "shrimp": 12727, "lagos": 12728, "authorities": 12729, "osity": 12730, "holly": 12731, "computing": 12732, "factors": 12733, "abe": 12734, "panels": 12735, "ramad": 12736, "sentence": 12737, "mission": 12738, "holm": 12739, "rb": 12740, "dads": 12741, "shanghai": 12742, "money": 12743, "sheets": 12744, "skate": 12745, "threw": 12746, "cupcakes": 12747, "infinite": 12748, "lis": 12749, "practicing": 12750, "essay": 12751, "kai": 12752, "asci": 12753, "mob": 12754, "ugh": 12755, "holmes": 12756, "regg": 12757, "ikh": 12758, "mock": 12759, "collections": 12760, "pep": 12761, "ova": 12762, "salt": 
12763, "nandez": 12764, "coy": 12765, "threats": 12766, "texts": 12767, "cinnam": 12768, "pregnancy": 12769, "pending": 12770, "stamp": 12771, "flower": 12772, "gis": 12773, "agreed": 12774, "payne": 12775, "rover": 12776, "phra": 12777, "soft": 12778, "ffin": 12779, "fathers": 12780, "passengers": 12781, "aways": 12782, "ala": 12783, "hes": 12784, "livan": 12785, "ins": 12786, "samuel": 12787, "ingui": 12788, "hof": 12789, "jj": 12790, "chennai": 12791, "catal": 12792, "omic": 12793, "heath": 12794, "niece": 12795, "pumped": 12796, "integrated": 12797, "arel": 12798, "nom": 12799, "productivity": 12800, "wanting": 12801, "visa": 12802, "diana": 12803, "twil": 12804, "itv": 12805, "camps": 12806, "rowing": 12807, "dley": 12808, "blackand": 12809, "guards": 12810, "bells": 12811, "reverse": 12812, "vibe": 12813, "ricky": 12814, "moss": 12815, "nyt": 12816, "âĺĢï¸ı": 12817, "elle": 12818, "troy": 12819, "cudd": 12820, "evan": 12821, "womens": 12822, "foto": 12823, "mistakes": 12824, "wicked": 12825, "mil": 12826, "cled": 12827, "memes": 12828, "cosmo": 12829, "scholar": 12830, "reno": 12831, "ðŁĺĢ": 12832, "vents": 12833, "#âĢ¦": 12834, "terrorists": 12835, "casey": 12836, "cardinals": 12837, "ðŁĺĬðŁĺĬ": 12838, "venezuela": 12839, "bola": 12840, "literacy": 12841, "tw": 12842, "eno": 12843, "contains": 12844, "austin": 12845, "financi": 12846, "evan": 12847, "harvard": 12848, "originally": 12849, "chevro": 12850, "herald": 12851, "nottingham": 12852, "managers": 12853, "âŀ¡": 12854, "accepting": 12855, "walsh": 12856, "tutorial": 12857, "entrepreneurship": 12858, "yacht": 12859, "requirements": 12860, "glenn": 12861, "pede": 12862, "unfortunately": 12863, "aching": 12864, "daisy": 12865, "gian": 12866, "nightmare": 12867, "âĿĹ": 12868, "rina": 12869, "bart": 12870, "emails": 12871, "opposite": 12872, "whom": 12873, "sake": 12874, "puzzle": 12875, "dashi": 12876, "party": 12877, "blanket": 12878, "buses": 12879, "lore": 12880, "beauty": 12881, "reason": 12882, "punjab": 12883, "windsor": 12884, "functional": 12885, "existing": 12886, "hello": 12887, "glimp": 12888, "convin": 12889, "lak": 12890, "screaming": 12891, "rebecca": 12892, "bliss": 12893, "northwest": 12894, "infinity": 12895, "cosmetics": 12896, "pulling": 12897, "coffee": 12898, "pling": 12899, "opho": 12900, "colombia": 12901, "interiordesign": 12902, "(+": 12903, "emotions": 12904, "sac": 12905, "sunglasses": 12906, "saves": 12907, "df": 12908, "sixth": 12909, "aly": 12910, "ðŁĺ»": 12911, "deen": 12912, "devast": 12913, "politicians": 12914, "lacrosse": 12915, "gu": 12916, "pei": 12917, "java": 12918, "combine": 12919, "coalition": 12920, "erts": 12921, "surviv": 12922, "chad": 12923, "strian": 12924, "nn": 12925, "devi": 12926, "counc": 12927, "concern": 12928, "controller": 12929, "breast": 12930, "jury": 12931, "tum": 12932, "introduces": 12933, "ladi": 12934, "mobile": 12935, "alz": 12936, "steady": 12937, "nurses": 12938, "hacking": 12939, "online": 12940, "ocean": 12941, "ðŁİĦ": 12942, "aam": 12943, "juven": 12944, "icc": 12945, "louisiana": 12946, "arte": 12947, "streetart": 12948, "ison": 12949, "wns": 12950, "frm": 12951, "panda": 12952, "noir": 12953, "maintain": 12954, "delay": 12955, "symptoms": 12956, "thorn": 12957, "geome": 12958, "tern": 12959, "carried": 12960, "pru": 12961, "panor": 12962, "assy": 12963, "peru": 12964, "cloud": 12965, "spra": 12966, "pedi": 12967, "este": 12968, "tagged": 12969, "ðŁĺĿ": 12970, "shadows": 12971, "nazi": 12972, "اÙĦ": 12973, "corri": 12974, "âĻ¥âĻ¥": 12975, "jad": 12976, "ðŁĩ«": 
12977, "formal": 12978, "spoken": 12979, "ðŁĮŀ": 12980, "enjoy": 12981, "lopez": 12982, "outlook": 12983, "inho": 12984, "wander": 12985, "Ùħ": 12986, "maya": 12987, "pee": 12988, "dine": 12989, "ãĢij": 12990, "briefing": 12991, "supporter": 12992, "arily": 12993, "ghters": 12994, "naturally": 12995, "doctorwho": 12996, "jen": 12997, "var": 12998, "newyear": 12999, "rese": 13000, "simm": 13001, "rex": 13002, "consequ": 13003, "tomatoes": 13004, "burst": 13005, "bravo": 13006, "burgers": 13007, "cracking": 13008, "northeast": 13009, "biom": 13010, "mushroom": 13011, "marque": 13012, "double": 13013, "nier": 13014, "vag": 13015, "twenty": 13016, "keyboard": 13017, "winni": 13018, "jamaica": 13019, "parish": 13020, ":-": 13021, "mentalhealth": 13022, "alizing": 13023, "render": 13024, "waking": 13025, "ðŁİĤ": 13026, "gly": 13027, "nathan": 13028, "washing": 13029, "melissa": 13030, "jung": 13031, "loyal": 13032, "chili": 13033, "songwriter": 13034, "guitarist": 13035, "bowie": 13036, "neighbors": 13037, "onymous": 13038, "asset": 13039, "tai": 13040, "headquarters": 13041, "ðŁĮĪ": 13042, "ihear": 13043, "cigare": 13044, "surg": 13045, ")\"": 13046, "repl": 13047, "darling": 13048, "ðŁĻĦ": 13049, "zak": 13050, "sare": 13051, "ãħĭ": 13052, "mickey": 13053, "warehouse": 13054, "massage": 13055, "inees": 13056, "didnt": 13057, "iw": 13058, "hurts": 13059, "engaging": 13060, "magic": 13061, "womenin": 13062, "kitten": 13063, "mors": 13064, "cart": 13065, "titans": 13066, "colleague": 13067, "competing": 13068, "eran": 13069, "khal": 13070, "marble": 13071, "demand": 13072, "delight": 13073, "etary": 13074, "blizz": 13075, "louise": 13076, "mls": 13077, "finishes": 13078, "experiment": 13079, "conducted": 13080, "electronics": 13081, "itters": 13082, "caring": 13083, "whats": 13084, "symbol": 13085, "jung": 13086, "ecu": 13087, "pix": 13088, "context": 13089, "charger": 13090, "ðŁĺĩ": 13091, "reig": 13092, "frag": 13093, "ëĭ": 13094, "chad": 13095, "true": 13096, "kerry": 13097, "defending": 13098, "aint": 13099, "auton": 13100, "checkout": 13101, "barnes": 13102, "lessly": 13103, "dt": 13104, "mme": 13105, "cloudy": 13106, "secondary": 13107, "arez": 13108, "_:": 13109, "appa": 13110, "constant": 13111, "\")": 13112, "vets": 13113, "job": 13114, "ient": 13115, "ðŁĺŃðŁĺŃðŁĺŃ": 13116, "mj": 13117, "french": 13118, "diver": 13119, "davies": 13120, "hhhh": 13121, "ebook": 13122, "à¹ī": 13123, "mariti": 13124, "breeze": 13125, "suspended": 13126, "mato": 13127, "viet": 13128, "rahu": 13129, "sei": 13130, "bolt": 13131, "enary": 13132, "leis": 13133, "karl": 13134, "framed": 13135, "explaining": 13136, "abc": 13137, "dealing": 13138, "nato": 13139, "jake": 13140, "expand": 13141, "leonard": 13142, "established": 13143, "dub": 13144, "armen": 13145, "elled": 13146, "vocal": 13147, "nicholas": 13148, "orient": 13149, "kyo": 13150, "illustrated": 13151, "ahh": 13152, "dancers": 13153, "million": 13154, "geta": 13155, "popp": 13156, "asu": 13157, "murdered": 13158, "gible": 13159, "stoked": 13160, "griffin": 13161, "maximum": 13162, "adrian": 13163, "encounter": 13164, "thero": 13165, "davidson": 13166, "ðŁį»": 13167, "holiday": 13168, "evo": 13169, "assets": 13170, "carson": 13171, "memorable": 13172, "âļ½": 13173, "obam": 13174, "representative": 13175, "cbd": 13176, "tricks": 13177, "vogue": 13178, "voice": 13179, "mmmm": 13180, "sebastian": 13181, "clif": 13182, "athy": 13183, "paralle": 13184, "ðŁ¤·": 13185, "pak": 13186, "evacu": 13187, "eats": 13188, "اØ": 13189, "touched": 13190, "organised": 13191, 
"spirits": 13192, "canad": 13193, "guided": 13194, "framework": 13195, "ðŁĮŁ": 13196, "ped": 13197, "natural": 13198, "agar": 13199, "replaced": 13200, "anchor": 13201, "tit": 13202, "shah": 13203, "organis": 13204, "superior": 13205, "rn": 13206, "chro": 13207, "erica": 13208, "still": 13209, "coron": 13210, "chuck": 13211, "locks": 13212, "organ": 13213, "rosen": 13214, "scam": 13215, "bened": 13216, "/#": 13217, "keen": 13218, "trevor": 13219, "vampire": 13220, "sorted": 13221, "!'": 13222, "afford": 13223, "intro": 13224, "grace": 13225, "ðŁĺľ": 13226, "saur": 13227, "kickstarter": 13228, "influen": 13229, "vu": 13230, "yup": 13231, "poc": 13232, "ðŁİ¥": 13233, "aar": 13234, "sang": 13235, "trek": 13236, "etsy": 13237, "tbh": 13238, "scream": 13239, "chevrolet": 13240, "pixel": 13241, "shepherd": 13242, "anor": 13243, "gabriel": 13244, "twood": 13245, "sdcc": 13246, "meters": 13247, "developers": 13248, "closure": 13249, "vw": 13250, "twitch": 13251, "ìĹ": 13252, "seoul": 13253, "price": 13254, "hog": 13255, "nish": 13256, "hillary": 13257, "scratch": 13258, "incen": 13259, "wagon": 13260, "disability": 13261, "panther": 13262, "chats": 13263, "gd": 13264, "witz": 13265, "sussex": 13266, "late": 13267, "denmark": 13268, "gerald": 13269, "cancelled": 13270, "nette": 13271, "ix": 13272, "naval": 13273, "baptist": 13274, "tet": 13275, "yad": 13276, "math": 13277, "hoy": 13278, "randy": 13279, "point": 13280, "intellec": 13281, "fruits": 13282, "wool": 13283, "guin": 13284, "pron": 13285, "theft": 13286, "condem": 13287, "marry": 13288, "nola": 13289, "architects": 13290, "cincin": 13291, "rockets": 13292, "gentleman": 13293, "explan": 13294, "tate": 13295, "doe": 13296, "raises": 13297, "wildlife": 13298, "wl": 13299, "insider": 13300, "blanc": 13301, "wp": 13302, "forsale": 13303, "nyc": 13304, "powell": 13305, "unbelievable": 13306, "pens": 13307, "goodies": 13308, "mustang": 13309, "pens": 13310, "stays": 13311, "squash": 13312, "xoxo": 13313, "nearby": 13314, "everton": 13315, "coco": 13316, "leagu": 13317, "khan": 13318, "stud": 13319, "southwest": 13320, "construc": 13321, "sworth": 13322, "croatia": 13323, "lea": 13324, "sums": 13325, "aims": 13326, "ean": 13327, "vaness": 13328, "itious": 13329, "pathy": 13330, "arcade": 13331, "bend": 13332, "suggests": 13333, "sacram": 13334, "royals": 13335, "rier": 13336, "emir": 13337, "incl": 13338, "ank": 13339, "clark": 13340, "right": 13341, "vacc": 13342, "ा": 13343, "tane": 13344, "lib": 13345, "usc": 13346, "sales": 13347, "huh": 13348, "sally": 13349, "vera": 13350, "pga": 13351, "grows": 13352, "drum": 13353, "tree": 13354, "ethics": 13355, "suggest": 13356, "isab": 13357, "sealed": 13358, "previously": 13359, "animated": 13360, "abdu": 13361, "rises": 13362, "glob": 13363, "predat": 13364, "scarf": 13365, "delic": 13366, "omar": 13367, "lli": 13368, "sxsw": 13369, "python": 13370, "nebra": 13371, "funk": 13372, "reflect": 13373, "pavilion": 13374, "tically": 13375, "chasing": 13376, "bakery": 13377, "invasion": 13378, "koh": 13379, "believed": 13380, "cohen": 13381, "conqu": 13382, "crafts": 13383, "nati": 13384, "clever": 13385, "governance": 13386, "samples": 13387, "fails": 13388, "âĶ": 13389, "timo": 13390, "ritu": 13391, "striking": 13392, "inclusive": 13393, "shocking": 13394, "cant": 13395, "requires": 13396, "drawings": 13397, "à¸Ń": 13398, "purchased": 13399, "dum": 13400, "zach": 13401, "warner": 13402, "console": 13403, "mansion": 13404, "fountain": 13405, "circum": 13406, "esh": 13407, "island": 13408, "milk": 13409, 
"profits": 13410, "halifax": 13411, "rival": 13412, "âľĪï¸ı": 13413, "jenny": 13414, "sandra": 13415, "nye": 13416, "kelly": 13417, "yal": 13418, "quad": 13419, "nos": 13420, "instein": 13421, "finalists": 13422, "midfielder": 13423, "cue": 13424, "exceptional": 13425, "aan": 13426, "sapp": 13427, "gettin": 13428, "saa": 13429, "fati": 13430, "slice": 13431, "volk": 13432, "swal": 13433, "lasting": 13434, "summary": 13435, "itas": 13436, "smo": 13437, "sz": 13438, "âĺĨ": 13439, "ipl": 13440, "flames": 13441, "enews": 13442, "hav": 13443, "hoodie": 13444, "pitcher": 13445, "windy": 13446, "revol": 13447, "central": 13448, "tonite": 13449, "ðŁİīðŁİī": 13450, "solved": 13451, "milwau": 13452, "organizations": 13453, "weets": 13454, "refin": 13455, "sth": 13456, "ãĥ¼": 13457, "elin": 13458, "tona": 13459, "cinnamon": 13460, "ðŁİ¨": 13461, "ðŁİģ": 13462, "ronaldo": 13463, "peninsu": 13464, "omega": 13465, "elds": 13466, "designing": 13467, "eigh": 13468, "bluet": 13469, "benz": 13470, "nug": 13471, "asha": 13472, "robots": 13473, "sudan": 13474, "choosing": 13475, "endo": 13476, "serge": 13477, "closely": 13478, "handy": 13479, "finger": 13480, "being": 13481, "arte": 13482, "survived": 13483, "flame": 13484, "milestone": 13485, "gut": 13486, "dwar": 13487, "futures": 13488, "ée": 13489, "elo": 13490, "fridge": 13491, "elic": 13492, "ouch": 13493, "ub": 13494, "pv": 13495, "titan": 13496, "collar": 13497, "station": 13498, "nevada": 13499, "aurora": 13500, "rd": 13501, "duncan": 13502, "âģł": 13503, "brien": 13504, "marsh": 13505, "о": 13506, "total": 13507, "chry": 13508, "sers": 13509, "suffe": 13510, "rachel": 13511, "college": 13512, "todays": 13513, "courts": 13514, "chit": 13515, "reunited": 13516, "gymna": 13517, "genesis": 13518, "beside": 13519, "representation": 13520, "chant": 13521, "collector": 13522, "rak": 13523, "athens": 13524, "nigh": 13525, "munich": 13526, "languages": 13527, "flu": 13528, "participation": 13529, "___": 13530, "cv": 13531, "spectrum": 13532, "soda": 13533, "cover": 13534, "referen": 13535, "abbo": 13536, "apa": 13537, "publication": 13538, "edm": 13539, "monica": 13540, "army": 13541, "ðŁļĢ": 13542, "divor": 13543, "dry": 13544, "streams": 13545, "robotics": 13546, "cider": 13547, "bullying": 13548, "approval": 13549, "stoke": 13550, "platforms": 13551, "sierra": 13552, "extin": 13553, "ib": 13554, "hayes": 13555, "succeed": 13556, "suffer": 13557, "atically": 13558, "dai": 13559, "lynch": 13560, "hound": 13561, "delines": 13562, "acknow": 13563, "dated": 13564, "exclusively": 13565, "heres": 13566, "facilit": 13567, "damaged": 13568, "charter": 13569, "lakers": 13570, "falcon": 13571, "unveiled": 13572, "welove": 13573, "ease": 13574, "patience": 13575, "lone": 13576, "gentle": 13577, "genetic": 13578, "producing": 13579, "gour": 13580, "shannon": 13581, "bilities": 13582, "zimbabwe": 13583, "pint": 13584, "daughters": 13585, "literary": 13586, "belle": 13587, "clam": 13588, "surrounded": 13589, "kany": 13590, "neil": 13591, "pirate": 13592, "ranger": 13593, "hbd": 13594, "natalie": 13595, "belong": 13596, "olympi": 13597, "embassy": 13598, "scol": 13599, "ener": 13600, "akin": 13601, "loren": 13602, "bh": 13603, ":/": 13604, "diva": 13605, "denim": 13606, "hipp": 13607, "ðŁĩµðŁĩ": 13608, "arnold": 13609, "?'": 13610, "weren": 13611, "empower": 13612, "disabled": 13613, "manor": 13614, "raspberry": 13615, "baf": 13616, "awful": 13617, "drummer": 13618, "kardashi": 13619, "nash": 13620, "machinelearning": 13621, "chu": 13622, "rebels": 13623, "timing": 
13624, "monroe": 13625, "tongue": 13626, "range": 13627, "pupils": 13628, "ress": 13629, "amazon": 13630, "bz": 13631, "harley": 13632, "palmer": 13633, "balloon": 13634, "sings": 13635, "icec": 13636, "jb": 13637, "cers": 13638, "gps": 13639, "whist": 13640, "rise": 13641, "lt": 13642, "oooo": 13643, "cattle": 13644, "shooter": 13645, "vodka": 13646, "ucl": 13647, "mtg": 13648, "lesli": 13649, "jonas": 13650, "dispo": 13651, "atric": 13652, "stein": 13653, "vintage": 13654, "firms": 13655, "floyd": 13656, "cowboy": 13657, "soooo": 13658, "isaac": 13659, "warcraft": 13660, "disneyland": 13661, "beautiful": 13662, "beam": 13663, "franchise": 13664, "bun": 13665, "kag": 13666, "anon": 13667, "turbo": 13668, "sweep": 13669, "madein": 13670, "karachi": 13671, "detective": 13672, "pennsylvania": 13673, "controversi": 13674, "vitamin": 13675, "aside": 13676, "chronic": 13677, "describes": 13678, "removal": 13679, "hah": 13680, "aper": 13681, "tened": 13682, "uto": 13683, "badly": 13684, "mirac": 13685, "fry": 13686, "yea": 13687, "injec": 13688, "thermal": 13689, "compact": 13690, "thor": 13691, "teed": 13692, "urgent": 13693, "lite": 13694, "gilli": 13695, "sophom": 13696, "ico": 13697, "chem": 13698, "pm": 13699, "fork": 13700, "freak": 13701, "chak": 13702, "recipient": 13703, "iy": 13704, "nik": 13705, "modeling": 13706, "cans": 13707, "ðŁıĢ": 13708, "delux": 13709, "seam": 13710, "survivors": 13711, "radical": 13712, "investigating": 13713, "reliable": 13714, "fm": 13715, "turt": 13716, "lighthouse": 13717, "tool": 13718, "gown": 13719, "))": 13720, "bots": 13721, "autograph": 13722, "aid": 13723, "buffe": 13724, "hmm": 13725, "horrible": 13726, "ssional": 13727, "anni": 13728, "à¹Ģ": 13729, "kits": 13730, "schi": 13731, "eternal": 13732, "huss": 13733, "sensitive": 13734, "ru": 13735, "tastes": 13736, "checks": 13737, "imo": 13738, "portion": 13739, "skate": 13740, "eden": 13741, "halftime": 13742, "fried": 13743, "rihanna": 13744, "tise": 13745, "flick": 13746, "cain": 13747, "sgt": 13748, "âľĶ": 13749, "shau": 13750, "stained": 13751, "raffle": 13752, "drove": 13753, "salman": 13754, "principles": 13755, "sho": 13756, "aru": 13757, "jess": 13758, "guine": 13759, "garbage": 13760, "myan": 13761, "jelly": 13762, "disru": 13763, "zia": 13764, "qld": 13765, "entries": 13766, "lav": 13767, "flew": 13768, "admit": 13769, "objects": 13770, "compare": 13771, "nytimes": 13772, "cannes": 13773, "pn": 13774, "suffol": 13775, "roc": 13776, "dana": 13777, "egg": 13778, "hist": 13779, "counsel": 13780, "'!": 13781, "physi": 13782, "imagination": 13783, "adjust": 13784, "explosion": 13785, "plymouth": 13786, "horror": 13787, "elliott": 13788, "bourne": 13789, "dex": 13790, "breed": 13791, "audio": 13792, "lobster": 13793, "disappointed": 13794, "nationwide": 13795, "((": 13796, "increases": 13797, "australi": 13798, "cedar": 13799, "staring": 13800, "racial": 13801, "eis": 13802, "gmt": 13803, "visions": 13804, "stayed": 13805, "discussions": 13806, "dean": 13807, "curtis": 13808, "maiden": 13809, "stellar": 13810, "happiest": 13811, "hwy": 13812, "preseason": 13813, "carav": 13814, "mondays": 13815, "hospitals": 13816, "glimpse": 13817, "scholars": 13818, "jai": 13819, "terrace": 13820, "anna": 13821, "goose": 13822, "graded": 13823, "lotus": 13824, "hung": 13825, "grocery": 13826, "stamps": 13827, "emperor": 13828, "scoop": 13829, "inser": 13830, "cas": 13831, "existence": 13832, "heal": 13833, "falcons": 13834, "marvel": 13835, "reducing": 13836, "terrific": 13837, "magnetic": 13838, "performs": 
13839, "barre": 13840, "pus": 13841, "treating": 13842, "icon": 13843, "wh": 13844, "declared": 13845, "trauma": 13846, "dod": 13847, "comedian": 13848, "nikon": 13849, "bugs": 13850, "asm": 13851, "montgom": 13852, "ibiza": 13853, "comprehensive": 13854, "has": 13855, "santi": 13856, "fellowship": 13857, "dash": 13858, "psal": 13859, "louisville": 13860, "spy": 13861, "fault": 13862, "dthe": 13863, "filed": 13864, "vista": 13865, "desc": 13866, "fears": 13867, "youtu": 13868, "sps": 13869, "esp": 13870, "rig": 13871, "crime": 13872, "berger": 13873, "wonderland": 13874, "kent": 13875, "informed": 13876, "stevens": 13877, "myth": 13878, "aston": 13879, "iri": 13880, "visitor": 13881, "atri": 13882, "producers": 13883, "alla": 13884, "personally": 13885, "separate": 13886, "agencies": 13887, "afri": 13888, "ilan": 13889, "spoke": 13890, "nina": 13891, "squad": 13892, "dives": 13893, "depend": 13894, "liv": 13895, "fierce": 13896, "entertaining": 13897, "chain": 13898, "scat": 13899, "borders": 13900, "palette": 13901, "spro": 13902, "osis": 13903, "derby": 13904, "tobacco": 13905, "zio": 13906, "willie": 13907, "juvent": 13908, "zoom": 13909, "holy": 13910, "entirely": 13911, "afe": 13912, "martinez": 13913, "beds": 13914, "pea": 13915, "bulldogs": 13916, "ðŁĩªðŁĩ": 13917, "ibm": 13918, "neon": 13919, "ethiopia": 13920, "teammates": 13921, "planting": 13922, "twer": 13923, "anytime": 13924, "forbes": 13925, "ón": 13926, "runway": 13927, "nervous": 13928, "roger": 13929, "pile": 13930, "chanc": 13931, "apocaly": 13932, "uw": 13933, "oi": 13934, "drought": 13935, "territory": 13936, "brick": 13937, "creatures": 13938, "goin": 13939, "waff": 13940, "gren": 13941, "southeast": 13942, "jean": 13943, "ambul": 13944, "edited": 13945, "strap": 13946, "cv": 13947, "aaron": 13948, "ãĥ»ãĥ»": 13949, "tsu": 13950, "description": 13951, "kindly": 13952, "clutch": 13953, "immer": 13954, "enor": 13955, "womensday": 13956, "orange": 13957, "rag": 13958, "obvious": 13959, "hyder": 13960, "channels": 13961, "mango": 13962, "meyer": 13963, "raining": 13964, "getty": 13965, "pilgri": 13966, "coordinator": 13967, "upload": 13968, "nintendo": 13969, "donuts": 13970, "sanchez": 13971, "apparel": 13972, "jr": 13973, "zzi": 13974, ",@": 13975, "jefferson": 13976, "accessible": 13977, "greatly": 13978, "eid": 13979, "initial": 13980, "buddha": 13981, "paris": 13982, "mascot": 13983, "â¬ĩï¸ı": 13984, "schwar": 13985, "siri": 13986, "spinning": 13987, "mortgage": 13988, "echo": 13989, "endange": 13990, "gedly": 13991, "chloe": 13992, "enhance": 13993, "karnat": 13994, "kry": 13995, "explores": 13996, "ðŁĴģ": 13997, "affair": 13998, "icals": 13999, "alla": 14000, "dart": 14001, "dolphins": 14002, "differences": 14003, "squirrel": 14004, "augh": 14005, "drones": 14006, "ellen": 14007, "restore": 14008, "paw": 14009, "unfor": 14010, "pike": 14011, "hilton": 14012, "collab": 14013, "consumers": 14014, "coinci": 14015, "outcomes": 14016, "ppp": 14017, "aq": 14018, "coupon": 14019, "liest": 14020, "sims": 14021, "kho": 14022, "aves": 14023, "spoon": 14024, "pudding": 14025, "corbyn": 14026, "haters": 14027, "exams": 14028, "slave": 14029, ".!": 14030, "psa": 14031, "apples": 14032, "tamil": 14033, "sed": 14034, "coke": 14035, "zzo": 14036, "losange": 14037, "carbon": 14038, "clair": 14039, "...)": 14040, "khu": 14041, "craig": 14042, "exploration": 14043, "sanctuary": 14044, "sue": 14045, "alway": 14046, "dementia": 14047, "wonders": 14048, "superhero": 14049, "pakistani": 14050, "browns": 14051, "bluetooth": 14052, 
"locker": 14053, "marc": 14054, "eventu": 14055, "deluxe": 14056, "rodriguez": 14057, "âĿ¤âĿ¤": 14058, "robb": 14059, "ðŁĴ¦": 14060, "linux": 14061, "tens": 14062, "intelligent": 14063, "seed": 14064, "voter": 14065, "sler": 14066, "peaks": 14067, "intern": 14068, "teenage": 14069, "peninsula": 14070, "handling": 14071, "tie": 14072, "cousins": 14073, "wendy": 14074, "mee": 14075, "à¹Ģà¸": 14076, "dino": 14077, "ðŁĴ°": 14078, "ðŁĺĥ": 14079, "zee": 14080, "sbury": 14081, "tragedy": 14082, "bk": 14083, "bore": 14084, "zin": 14085, "warns": 14086, "idiot": 14087, "touching": 14088, "continental": 14089, "tacos": 14090, "safari": 14091, "washed": 14092, "podium": 14093, "morrison": 14094, "forests": 14095, "cbc": 14096, "alon": 14097, "particular": 14098, "beads": 14099, "invented": 14100, "loch": 14101, "lighter": 14102, "wherever": 14103, "ide": 14104, "documents": 14105, "awe": 14106, "kr": 14107, "nowhere": 14108, "miner": 14109, "stit": 14110, "rox": 14111, "contribute": 14112, "hardy": 14113, "clan": 14114, "object": 14115, "cait": 14116, "ðŁĴķðŁĴķ": 14117, "happier": 14118, "vegetables": 14119, "tart": 14120, "gag": 14121, "nominee": 14122, "heavily": 14123, "panic": 14124, "jd": 14125, "theresa": 14126, "atm": 14127, "uph": 14128, "sfc": 14129, "suri": 14130, "drink": 14131, "nal": 14132, "revel": 14133, "kl": 14134, "avocado": 14135, "nomination": 14136, "madonna": 14137, "sharon": 14138, "malcolm": 14139, "controlled": 14140, "shers": 14141, "revival": 14142, "legislation": 14143, "shoots": 14144, "nin": 14145, "commentary": 14146, "pros": 14147, "humanrights": 14148, "stranger": 14149, "mitch": 14150, "pipeline": 14151, "legally": 14152, "thu": 14153, "gilbert": 14154, "toll": 14155, "granted": 14156, "ghs": 14157, "iranian": 14158, "refreshing": 14159, "duk": 14160, "abi": 14161, "prime": 14162, "joseph": 14163, "mosa": 14164, "statistics": 14165, "productions": 14166, "merry": 14167, "patel": 14168, "sax": 14169, "humanitarian": 14170, "structures": 14171, "emissions": 14172, "towns": 14173, "freel": 14174, "stering": 14175, "ratings": 14176, "allegedly": 14177, "cabin": 14178, "stl": 14179, "wade": 14180, "flyers": 14181, "trim": 14182, "promising": 14183, "zu": 14184, "ballot": 14185, "comparison": 14186, "freeze": 14187, "outer": 14188, "greatness": 14189, "assign": 14190, "snowy": 14191, "rale": 14192, "tories": 14193, "mediter": 14194, "knock": 14195, "consultant": 14196, "cincinnati": 14197, "analyst": 14198, "scoo": 14199, "jews": 14200, "approxim": 14201, "pure": 14202, "portraits": 14203, "cyrus": 14204, "ational": 14205, "loans": 14206, "acquis": 14207, "elu": 14208, "acceptable": 14209, "union": 14210, "watercolor": 14211, "rust": 14212, "battles": 14213, "perfu": 14214, "seasonal": 14215, "serial": 14216, "mindset": 14217, "riot": 14218, "feld": 14219, "ennial": 14220, "closet": 14221, "priest": 14222, "tanks": 14223, "intl": 14224, "screw": 14225, "bum": 14226, "abdul": 14227, "oux": 14228, "explained": 14229, "rica": 14230, "imaging": 14231, "lawyers": 14232, "buried": 14233, "ãĥ»ãĥ»ãĥ»": 14234, "earl": 14235, "âĢķ": 14236, "lton": 14237, "restored": 14238, "stripes": 14239, "foss": 14240, "demands": 14241, "stealing": 14242, "alexis": 14243, "mund": 14244, "aker": 14245, "urus": 14246, "wardro": 14247, "hugs": 14248, "genre": 14249, "ego": 14250, "ÙĦ": 14251, "participated": 14252, "babes": 14253, "banquet": 14254, "tious": 14255, "hemi": 14256, "dsb": 14257, "lost": 14258, "milwaukee": 14259, "jenner": 14260, "gem": 14261, "outra": 14262, "loses": 14263, "idi": 
14264, "reps": 14265, "ðŁİ§": 14266, "regulation": 14267, "flaw": 14268, "fang": 14269, "vibrant": 14270, "ramp": 14271, "rains": 14272, "wellbeing": 14273, "soviet": 14274, "viewers": 14275, "depo": 14276, "libraries": 14277, "bigo": 14278, "sery": 14279, "gill": 14280, "destruction": 14281, "coz": 14282, "cx": 14283, "bridal": 14284, "alds": 14285, "planted": 14286, "amateur": 14287, "lud": 14288, "cheering": 14289, "showcas": 14290, "profile": 14291, "iu": 14292, "vertical": 14293, "packers": 14294, "wizard": 14295, "skip": 14296, "slight": 14297, "beau": 14298, "airways": 14299, "much": 14300, "rera": 14301, "ðŁĮĬ": 14302, "absor": 14303, "patio": 14304, "packages": 14305, "sells": 14306, "mentally": 14307, "ðŁĺ¢": 14308, "reynolds": 14309, "kare": 14310, "tribun": 14311, "walt": 14312, "knit": 14313, "taste": 14314, "surrey": 14315, "bounce": 14316, "creature": 14317, "bare": 14318, "betting": 14319, "sure": 14320, "miley": 14321, "laughs": 14322, "alore": 14323, "cyn": 14324, "tl": 14325, "artist": 14326, "annah": 14327, "warmer": 14328, "dynamics": 14329, "lunchtime": 14330, "maritime": 14331, "vulnerable": 14332, "ðŁĴĥ": 14333, "wolver": 14334, "durham": 14335, "constantly": 14336, "amin": 14337, "sibl": 14338, ":@": 14339, "bullet": 14340, "kach": 14341, "angelo": 14342, "wilder": 14343, "doom": 14344, "desktop": 14345, "lawsuit": 14346, "kca": 14347, "henderson": 14348, "inviting": 14349, "betty": 14350, "tawards": 14351, "rafa": 14352, "leaked": 14353, "andi": 14354, "gems": 14355, "afl": 14356, "velo": 14357, "mediterran": 14358, "probe": 14359, "totten": 14360, "stephanie": 14361, "snation": 14362, "combe": 14363, "qs": 14364, "overcome": 14365, "assassin": 14366, "rav": 14367, "filip": 14368, "winnipeg": 14369, "shil": 14370, "determined": 14371, "kas": 14372, "outre": 14373, "regret": 14374, "guides": 14375, "aaa": 14376, "ðŁĺĪ": 14377, "wives": 14378, "manife": 14379, "erly": 14380, "smy": 14381, "shima": 14382, "xing": 14383, "pixel": 14384, "jacob": 14385, "accommod": 14386, "toy": 14387, "ono": 14388, "poo": 14389, "tier": 14390, "answe": 14391, "ðŁĴģ": 14392, "rosa": 14393, "lease": 14394, "belongs": 14395, "thar": 14396, "eventually": 14397, "neither": 14398, "goa": 14399, "skiing": 14400, "atra": 14401, "agh": 14402, "broadcasting": 14403, "fury": 14404, "pyram": 14405, "dice": 14406, "volkswag": 14407, "womens": 14408, "provider": 14409, "bombs": 14410, "missile": 14411, "whip": 14412, "dick": 14413, "norwe": 14414, "backup": 14415, "elder": 14416, "mature": 14417, "concerts": 14418, "gious": 14419, "squee": 14420, "goodmorning": 14421, "braves": 14422, "^_": 14423, "aussie": 14424, "luna": 14425, "males": 14426, "heck": 14427, "fortn": 14428, "romeo": 14429, "steelers": 14430, "pn": 14431, "peer": 14432, "represents": 14433, "«": 14434, "katy": 14435, "miguel": 14436, "require": 14437, "chains": 14438, "lur": 14439, "immediate": 14440, "timber": 14441, "âĸ¶ï¸ı": 14442, "advocacy": 14443, "export": 14444, "anz": 14445, "tiffany": 14446, "author": 14447, "ðŁİĪ": 14448, "dudes": 14449, "chilly": 14450, "hid": 14451, "harm": 14452, "bug": 14453, "monster": 14454, "terrier": 14455, "tuc": 14456, "storytelling": 14457, "tak": 14458, "inti": 14459, "immigrants": 14460, "bis": 14461, "reaches": 14462, "compassion": 14463, "johnny": 14464, "contributions": 14465, "ðŁIJ¶": 14466, "mechanical": 14467, "impression": 14468, "ranks": 14469, "kobe": 14470, "menting": 14471, "blossom": 14472, "pablo": 14473, "builder": 14474, "bombing": 14475, "twel": 14476, "sullivan": 14477, 
"omo": 14478, "pete": 14479, "demi": 14480, "kudos": 14481, "wbb": 14482, "tgif": 14483, "massach": 14484, "neighbor": 14485, "chefs": 14486, "engines": 14487, "pune": 14488, "gained": 14489, "phantom": 14490, "sdays": 14491, "extend": 14492, "gran": 14493, "centers": 14494, "jacqu": 14495, "datasci": 14496, "sleepy": 14497, "elvis": 14498, "answered": 14499, "slot": 14500, "cony": 14501, "flexible": 14502, "tially": 14503, "letics": 14504, "%,": 14505, "andrews": 14506, "sible": 14507, "momma": 14508, "vino": 14509, "dox": 14510, "invitational": 14511, "twilight": 14512, "jade": 14513, "illery": 14514, "johns": 14515, "fou": 14516, "pv": 14517, "--->": 14518, "breakdown": 14519, "billion": 14520, "printer": 14521, "mond": 14522, "cbc": 14523, "maggie": 14524, "legion": 14525, "dub": 14526, "kurt": 14527, "poor": 14528, "parenting": 14529, "regions": 14530, "bikini": 14531, "beware": 14532, "sional": 14533, "auburn": 14534, "kidding": 14535, "amples": 14536, "span": 14537, "contempor": 14538, "cic": 14539, "habits": 14540, "ako": 14541, "prefe": 14542, "buddies": 14543, "itz": 14544, "emily": 14545, "personnel": 14546, "mountain": 14547, "versus": 14548, "ðŁĺ¬": 14549, "earning": 14550, "sink": 14551, "dari": 14552, "uu": 14553, "swin": 14554, "ister": 14555, "brutal": 14556, "nac": 14557, "kata": 14558, "cloth": 14559, "amand": 14560, "ðŁĶĹ": 14561, "neo": 14562, "alumin": 14563, "weekends": 14564, "nebraska": 14565, "codes": 14566, "delayed": 14567, "bruno": 14568, "proven": 14569, "inc": 14570, "ight": 14571, "flan": 14572, "oro": 14573, "lambert": 14574, "regulat": 14575, "wf": 14576, "massachuse": 14577, "kardashian": 14578, "bernard": 14579, "fiesta": 14580, "volcano": 14581, "grandpa": 14582, "anca": 14583, "dre": 14584, "stitu": 14585, "meaning": 14586, "foam": 14587, "auck": 14588, "ated": 14589, "rl": 14590, "hotel": 14591, "persons": 14592, "dynasty": 14593, "ellor": 14594, "mai": 14595, "amne": 14596, "styling": 14597, "avier": 14598, "eg": 14599, "vegetarian": 14600, ",âĢ¦": 14601, "founders": 14602, "stain": 14603, "gd": 14604, "cycles": 14605, "skyline": 14606, "tractor": 14607, "exists": 14608, "tral": 14609, "kidney": 14610, "maril": 14611, "instag": 14612, "sette": 14613, "addict": 14614, "triangle": 14615, "flashback": 14616, "controversial": 14617, "zon": 14618, "pins": 14619, "ias": 14620, "tray": 14621, "township": 14622, "delegates": 14623, "spam": 14624, "hms": 14625, "crane": 14626, "peoples": 14627, "olo": 14628, "faction": 14629, "butes": 14630, "onica": 14631, "delegation": 14632, "newprofile": 14633, "elier": 14634, "mca": 14635, "wand": 14636, "gely": 14637, "losangeles": 14638, "berke": 14639, "tive": 14640, "disrup": 14641, "zza": 14642, "casa": 14643, "jordan": 14644, "fordshire": 14645, "gathered": 14646, "ichi": 14647, "attendees": 14648, "à¸Ńà¸": 14649, "peppers": 14650, "coin": 14651, "bourbon": 14652, "ernity": 14653, "rotary": 14654, "behaviour": 14655, "jeremy": 14656, "teamwork": 14657, "compliance": 14658, "tremend": 14659, "ðŁĩ§": 14660, "buhari": 14661, "cambo": 14662, "buyers": 14663, "hagen": 14664, "buds": 14665, "bayern": 14666, "monte": 14667, "smells": 14668, "anza": 14669, "athlon": 14670, "described": 14671, "workforce": 14672, "giving": 14673, "api": 14674, "investments": 14675, "dail": 14676, "selena": 14677, "database": 14678, "thum": 14679, "mortal": 14680, "student": 14681, "buyer": 14682, "dover": 14683, "garten": 14684, "attle": 14685, "loyalty": 14686, "genoci": 14687, "holocau": 14688, "theaters": 14689, "ruling": 14690, "venus": 
14691, "patent": 14692, "chun": 14693, "abby": 14694, "awake": 14695, "massacre": 14696, "bangalore": 14697, "breaking": 14698, "simmons": 14699, "justi": 14700, "hale": 14701, "edchat": 14702, "ggles": 14703, "hawk": 14704, "marking": 14705, "headlines": 14706, "strom": 14707, "cove": 14708, "breathtaking": 14709, "medals": 14710, "haircut": 14711, "christine": 14712, "telegraph": 14713, "gujarat": 14714, "jura": 14715, "cane": 14716, "shore": 14717, "propaganda": 14718, "mueller": 14719, "........": 14720, "savi": 14721, "stomach": 14722, "throws": 14723, "tab": 14724, "warm": 14725, "jong": 14726, "renowned": 14727, "hir": 14728, "rais": 14729, "mushrooms": 14730, "guaranteed": 14731, "boa": 14732, "mj": 14733, "revolutionary": 14734, "certification": 14735, "bruins": 14736, "join": 14737, "wes": 14738, "passport": 14739, "cg": 14740, "sexu": 14741, "capable": 14742, "wv": 14743, "tones": 14744, "jackets": 14745, "accompan": 14746, "spinach": 14747, "forever": 14748, "blair": 14749, "watts": 14750, "gl": 14751, "couples": 14752, "prairie": 14753, "newprofilepic": 14754, "logistics": 14755, "massachusetts": 14756, "jaguar": 14757, "oid": 14758, "weal": 14759, "underwater": 14760, "moz": 14761, "yi": 14762, "maths": 14763, "myanmar": 14764, "preps": 14765, "suffered": 14766, "trace": 14767, "wali": 14768, "ahhh": 14769, "borg": 14770, "stitch": 14771, "culin": 14772, "realise": 14773, "infection": 14774, "discrimination": 14775, "shame": 14776, "ankle": 14777, "humid": 14778, "yt": 14779, "bracket": 14780, "truck": 14781, "triu": 14782, "easter": 14783, "community": 14784, "postcard": 14785, "involving": 14786, "tyler": 14787, "caramel": 14788, "overview": 14789, "examples": 14790, "integrity": 14791, "basement": 14792, "instruments": 14793, "anium": 14794, "atus": 14795, "gher": 14796, "laundry": 14797, "achieve": 14798, "geneva": 14799, "pricing": 14800, "hyderabad": 14801, "belief": 14802, "meta": 14803, "jaw": 14804, "accounting": 14805, "leader": 14806, "cristiano": 14807, "couture": 14808, "cyp": 14809, "vised": 14810, ",,,": 14811, "knu": 14812, "hick": 14813, "breaker": 14814, "bram": 14815, "rab": 14816, "moor": 14817, "hamas": 14818, "graduating": 14819, "puppies": 14820, "akh": 14821, "tah": 14822, "aches": 14823, "rie": 14824, "opini": 14825, "gta": 14826, "reign": 14827, "tragic": 14828, "rever": 14829, "pill": 14830, "pineapple": 14831, "touches": 14832, "dare": 14833, "leys": 14834, "ilo": 14835, "interiors": 14836, "scouts": 14837, "bart": 14838, "enzie": 14839, "dono": 14840, "brock": 14841, "christians": 14842, "ensemble": 14843, "·": 14844, "cinemas": 14845, "newport": 14846, "airline": 14847, "winston": 14848, "leigh": 14849, "contents": 14850, "prescri": 14851, "urge": 14852, "trout": 14853, "fically": 14854, "ilia": 14855, "subsi": 14856, "arer": 14857, "âļ¾ï¸ı": 14858, "wounded": 14859, "ðŁĻĤ": 14860, "pepper": 14861, "ðŁĴŀ": 14862, "fitted": 14863, "aff": 14864, "resur": 14865, "thursdaythoughts": 14866, "zero": 14867, "archaeology": 14868, "div": 14869, "jee": 14870, "ion": 14871, "awaiting": 14872, "cozy": 14873, "beauties": 14874, "bald": 14875, "data": 14876, "grizz": 14877, "stalk": 14878, "kinds": 14879, "cleared": 14880, "jessic": 14881, "regular": 14882, "aliens": 14883, "place": 14884, "bos": 14885, "bizar": 14886, "thisis": 14887, "ðŁĴĢ": 14888, "tottenham": 14889, "mafia": 14890, "slam": 14891, "ariana": 14892, "carroll": 14893, "backpack": 14894, "carey": 14895, "univ": 14896, "rg": 14897, "pep": 14898, "digit": 14899, "tattoos": 14900, "agon": 14901, 
"volunteering": 14902, "differen": 14903, "consumption": 14904, "kathr": 14905, "headphones": 14906, "tshirt": 14907, "ob": 14908, "element": 14909, "retail": 14910, "shru": 14911, "algori": 14912, "container": 14913, "conscious": 14914, "fil": 14915, "coming": 14916, "rash": 14917, "urope": 14918, "define": 14919, "gior": 14920, "feminist": 14921, "flowing": 14922, "routes": 14923, "glaci": 14924, "fert": 14925, "somerset": 14926, "antes": 14927, "tweeps": 14928, "$$": 14929, "hour": 14930, "endangered": 14931, "yearsof": 14932, "roh": 14933, "popped": 14934, "backing": 14935, "basil": 14936, "brake": 14937, "monaco": 14938, "lgbtq": 14939, "prague": 14940, "utility": 14941, "cassi": 14942, "gateway": 14943, "haunted": 14944, "schul": 14945, "ðŁİµ": 14946, "should": 14947, "walkingdead": 14948, "completing": 14949, "danny": 14950, "montgomery": 14951, "penguin": 14952, "ssi": 14953, "merchandi": 14954, "ðŁijij": 14955, "church": 14956, "hates": 14957, "captain": 14958, "breathing": 14959, "cet": 14960, "fairly": 14961, "approaches": 14962, "companion": 14963, "surprising": 14964, "kanye": 14965, "pey": 14966, "hindi": 14967, "targeted": 14968, "lords": 14969, "deut": 14970, "digging": 14971, "german": 14972, "rut": 14973, "energy": 14974, "closest": 14975, "yun": 14976, "apologi": 14977, "ั": 14978, "sack": 14979, "rup": 14980, "ddy": 14981, "portal": 14982, "dough": 14983, "bats": 14984, "ðŁĵ°": 14985, "atur": 14986, "grapher": 14987, "pires": 14988, "motors": 14989, "ðŁĮ¹": 14990, "jc": 14991, "dang": 14992, "tuk": 14993, "clue": 14994, "usc": 14995, "page": 14996, "dless": 14997, "brows": 14998, "jus": 14999, "ading": 15000, "remarks": 15001, "oom": 15002, "cardio": 15003, "stefan": 15004, "armstrong": 15005, "âĢ¢âĢ¢": 15006, "niest": 15007, "belgian": 15008, "biop": 15009, "soy": 15010, "lof": 15011, "íĥ": 15012, "qt": 15013, "flashbackfriday": 15014, "cee": 15015, "ģà¸": 15016, "wreck": 15017, "marines": 15018, "amendment": 15019, "wardrobe": 15020, "voy": 15021, "burned": 15022, "guitars": 15023, "rainf": 15024, "lifel": 15025, "ssil": 15026, "ounce": 15027, "external": 15028, "ckey": 15029, "mesh": 15030, "sheikh": 15031, "invitation": 15032, "suggesti": 15033, "popcorn": 15034, "phenomenal": 15035, "anonymous": 15036, "tuna": 15037, "chicago": 15038, "oval": 15039, "dely": 15040, "locals": 15041, "(&": 15042, "prof": 15043, "novel": 15044, "finder": 15045, "sparks": 15046, "laven": 15047, "infu": 15048, "nicks": 15049, "quant": 15050, "rae": 15051, "exec": 15052, "distingui": 15053, "stances": 15054, "mutual": 15055, "shal": 15056, "unveils": 15057, "edmonton": 15058, "zania": 15059, "adio": 15060, "viewer": 15061, "bradford": 15062, "auditorium": 15063, "quis": 15064, "react": 15065, "http": 15066, "lero": 15067, "cheeky": 15068, "impacts": 15069, "tak": 15070, "edt": 15071, "desperate": 15072, "tay": 15073, "ìĦ": 15074, "settle": 15075, "bargain": 15076, "resume": 15077, "unite": 15078, "thrown": 15079, "kest": 15080, "seys": 15081, "marching": 15082, "amit": 15083, "decline": 15084, "schar": 15085, "metr": 15086, "stanford": 15087, "linke": 15088, "berra": 15089, "dolls": 15090, "rugby": 15091, "jami": 15092, "bor": 15093, "roadtrip": 15094, "dinosaur": 15095, "mik": 15096, "sunder": 15097, "rem": 15098, "bk": 15099, "overseas": 15100, "naughty": 15101, "implementation": 15102, "iamsrk": 15103, "luncheon": 15104, "firing": 15105, "miami": 15106, "perez": 15107, "thee": 15108, "zon": 15109, "gifted": 15110, "conversion": 15111, "ceramic": 15112, "¡ï¸ı": 15113, "pedro": 15114, 
"ìĨ": 15115, "vick": 15116, "!@": 15117, "heed": 15118, "sid": 15119, "bw": 15120, "document": 15121, "plun": 15122, "grants": 15123, "fantasy": 15124, "predictions": 15125, "valid": 15126, "carved": 15127, "graduated": 15128, "ðŁijįðŁı»": 15129, "nationally": 15130, "chy": 15131, "afl": 15132, "resso": 15133, "blank": 15134, "rivals": 15135, "jig": 15136, "eties": 15137, "omics": 15138, "unemp": 15139, "bound": 15140, "sko": 15141, "inspection": 15142, "paral": 15143, "highs": 15144, "crisp": 15145, "bans": 15146, "oba": 15147, "[@": 15148, "cospla": 15149, "costumes": 15150, "recall": 15151, "mouth": 15152, "nigel": 15153, "bts": 15154, "tera": 15155, "kov": 15156, "docs": 15157, "westminster": 15158, "dict": 15159, "gravity": 15160, "kari": 15161, "rogue": 15162, "tted": 15163, "wark": 15164, "idaho": 15165, "wend": 15166, "awi": 15167, "queensland": 15168, "processes": 15169, "cliffe": 15170, "mick": 15171, "compens": 15172, "opol": 15173, "they": 15174, "clari": 15175, "wikipedia": 15176, "salmankhan": 15177, "hazard": 15178, "preston": 15179, "sweetest": 15180, "pdf": 15181, "chees": 15182, "trilo": 15183, "southafrica": 15184, "burnt": 15185, "($": 15186, "contain": 15187, "tp": 15188, "submitted": 15189, "soundcloud": 15190, "atu": 15191, "rez": 15192, "wordpress": 15193, "corrupt": 15194, "nf": 15195, "maker": 15196, "íķ": 15197, "paras": 15198, "advent": 15199, "rial": 15200, "cafe": 15201, "fossil": 15202, "!!!!!!!": 15203, "cows": 15204, "cj": 15205, "spur": 15206, "institutions": 15207, "landmark": 15208, "entit": 15209, "reut": 15210, "his": 15211, "alzheim": 15212, "wemb": 15213, "reggae": 15214, "mosqu": 15215, "stat": 15216, "identified": 15217, "dealer": 15218, "ream": 15219, "reland": 15220, "tension": 15221, "ðŁĩ©": 15222, "wrapping": 15223, "deeper": 15224, "frat": 15225, "reddit": 15226, "aris": 15227, "morocco": 15228, "..\"": 15229, "blow": 15230, "mapping": 15231, "priorities": 15232, "inga": 15233, "swap": 15234, "rewards": 15235, "conspiracy": 15236, "creative": 15237, "cj": 15238, "congressional": 15239, "vault": 15240, "plex": 15241, "sophomore": 15242, "shadow": 15243, "eless": 15244, "ðŁĺħ": 15245, "darts": 15246, "aldub": 15247, "annoying": 15248, "props": 15249, "nas": 15250, "aluminum": 15251, "hbo": 15252, "offense": 15253, "jill": 15254, "onions": 15255, "laur": 15256, "tae": 15257, "hardest": 15258, "shro": 15259, "gaining": 15260, "measure": 15261, "edtech": 15262, "cyprus": 15263, "tara": 15264, "angeli": 15265, "carlo": 15266, "goon": 15267, "alli": 15268, "implic": 15269, "jupit": 15270, "resilience": 15271, "hail": 15272, "balanced": 15273, ")...": 15274, "joyce": 15275, "gra": 15276, "theli": 15277, "defined": 15278, "shipped": 15279, "mainly": 15280, "mina": 15281, "lm": 15282, "sacri": 15283, "ober": 15284, "pim": 15285, "claiming": 15286, "enters": 15287, "corey": 15288, "bok": 15289, "cried": 15290, "cooling": 15291, "danielle": 15292, "pharmacy": 15293, "thorough": 15294, "cake": 15295, "klo": 15296, "outreach": 15297, "zens": 15298, "digitalmarketing": 15299, "valent": 15300, "snp": 15301, "herb": 15302, "mrw": 15303, "café": 15304, "captures": 15305, "notre": 15306, "triumph": 15307, "pancakes": 15308, "cumber": 15309, "spike": 15310, "dation": 15311, "bigg": 15312, "sper": 15313, "critical": 15314, "amal": 15315, "tooth": 15316, "founding": 15317, "astro": 15318, "'#": 15319, "quantum": 15320, "thames": 15321, "unc": 15322, "pride": 15323, "airbus": 15324, "knocked": 15325, "undefeated": 15326, "mediterranean": 15327, "calcu": 15328, 
"clown": 15329, "sensor": 15330, "hammer": 15331, "forgive": 15332, "cushi": 15333, "berry": 15334, "majestic": 15335, "elect": 15336, "politan": 15337, "gta": 15338, "kari": 15339, "burke": 15340, "seahawks": 15341, "volkswagen": 15342, "rei": 15343, "landscapes": 15344, "casu": 15345, "grandfather": 15346, "listened": 15347, "//": 15348, "startrek": 15349, "rainfall": 15350, "furry": 15351, "vier": 15352, "stark": 15353, "rifle": 15354, "ffa": 15355, "leges": 15356, "hillaryclinton": 15357, "minus": 15358, "correctly": 15359, "architectural": 15360, "prece": 15361, "upside": 15362, "boxer": 15363, "ðŁĻĮðŁı¼": 15364, "isai": 15365, "det": 15366, "provo": 15367, "tissue": 15368, "spooky": 15369, "veled": 15370, "recon": 15371, "prospects": 15372, "quebec": 15373, "âļ«": 15374, "igno": 15375, "anatomy": 15376, "shapes": 15377, "wp": 15378, "pinterest": 15379, "hore": 15380, "anes": 15381, "pickup": 15382, "tip": 15383, "pradesh": 15384, "hugh": 15385, "coe": 15386, "pok": 15387, "grammy": 15388, "wellington": 15389, "stigate": 15390, "righ": 15391, "leap": 15392, "kingston": 15393, "scenic": 15394, "gosh": 15395, "vani": 15396, "aug": 15397, "sary": 15398, "zier": 15399, "bureau": 15400, "linson": 15401, "conte": 15402, "fragr": 15403, "allan": 15404, "gaw": 15405, "lana": 15406, "collision": 15407, "surveill": 15408, "renais": 15409, "arrange": 15410, "sali": 15411, "doin": 15412, "brance": 15413, "brendan": 15414, "ourse": 15415, "incoming": 15416, "suspension": 15417, "à´": 15418, "lla": 15419, "educators": 15420, "intri": 15421, "dae": 15422, "biography": 15423, "bulgar": 15424, "villain": 15425, "gothic": 15426, "rwanda": 15427, "ew": 15428, "mayor": 15429, "meetup": 15430, "democrat": 15431, "morgan": 15432, "sudden": 15433, "tesco": 15434, "carrot": 15435, "bomber": 15436, "mckin": 15437, "rene": 15438, "funday": 15439, "agricultural": 15440, "hahah": 15441, "showtime": 15442, "forming": 15443, "cola": 15444, "scorpi": 15445, "quote": 15446, "poppy": 15447, "slife": 15448, "daz": 15449, "tub": 15450, "nen": 15451, "mot": 15452, "ðŁĺ»": 15453, "sore": 15454, "elderly": 15455, "ove": 15456, "skinny": 15457, "umi": 15458, "anco": 15459, "manship": 15460, "were": 15461, "gv": 15462, "kah": 15463, "folding": 15464, "neat": 15465, "samantha": 15466, "danish": 15467, "ukrain": 15468, "humidity": 15469, "nutri": 15470, "jakarta": 15471, "candles": 15472, "oooooooo": 15473, "atile": 15474, "strength": 15475, "ibra": 15476, "bapti": 15477, "charleston": 15478, "frames": 15479, "girls": 15480, "clearing": 15481, "gluten": 15482, "##": 15483, "supernatural": 15484, "jubi": 15485, "phone": 15486, "hein": 15487, "drun": 15488, "leak": 15489, "investor": 15490, "yer": 15491, "domain": 15492, "ballroom": 15493, "mish": 15494, "appli": 15495, "offshore": 15496, "blaze": 15497, "doro": 15498, "âĺķï¸ı": 15499, "winery": 15500, "sharif": 15501, "adore": 15502, "nir": 15503, "safer": 15504, "sigh": 15505, "ascri": 15506, "strongly": 15507, "tracy": 15508, "cker": 15509, "oll": 15510, "faithful": 15511, "eyed": 15512, "delightful": 15513, "vism": 15514, "karnataka": 15515, "titan": 15516, "whar": 15517, "jerseys": 15518, "refur": 15519, "heaven": 15520, "grip": 15521, "panama": 15522, "preli": 15523, "gluten": 15524, "odd": 15525, "content": 15526, "ponti": 15527, "tioning": 15528, "ecommerce": 15529, "federation": 15530, "flawless": 15531, "gear": 15532, "tires": 15533, "byr": 15534, "police": 15535, "cuban": 15536, "tributes": 15537, "ticul": 15538, "churches": 15539, "nursery": 15540, "diaries": 15541, 
"museums": 15542, "snapped": 15543, "ivan": 15544, "wight": 15545, "tourists": 15546, "ramadan": 15547, "trent": 15548, "prophet": 15549, "wondered": 15550, "focusing": 15551, "hid": 15552, "icons": 15553, "iq": 15554, "ambulance": 15555, "pist": 15556, "funniest": 15557, "timeless": 15558, "srilan": 15559, "buys": 15560, "kids": 15561, "colourful": 15562, "ashi": 15563, "chir": 15564, "mum": 15565, "ðŁĵļ": 15566, "letter": 15567, "xen": 15568, "reuters": 15569, "preserve": 15570, "inting": 15571, "step": 15572, "fuji": 15573, "univer": 15574, "iu": 15575, "showdown": 15576, "poems": 15577, "surveillance": 15578, "suspected": 15579, "tae": 15580, "solving": 15581, "tomb": 15582, "mothersday": 15583, "carpen": 15584, "recruit": 15585, "pilots": 15586, "broc": 15587, "mixing": 15588, "fridays": 15589, "tyr": 15590, "representatives": 15591, "trapped": 15592, "abdul": 15593, "freestyle": 15594, "cluster": 15595, "âļłï¸ı": 15596, "kd": 15597, "skill": 15598, "pitt": 15599, "exo": 15600, "commerci": 15601, "museum": 15602, "locally": 15603, "gina": 15604, "nobel": 15605, "immune": 15606, "frac": 15607, "capsu": 15608, "mained": 15609, "attempts": 15610, "bulldog": 15611, "bespoke": 15612, "singers": 15613, "spelling": 15614, "segment": 15615, "natures": 15616, "tick": 15617, "lipstick": 15618, "cleaner": 15619, "gettable": 15620, "precision": 15621, "âĢ¼ï¸ı": 15622, "thood": 15623, "reef": 15624, "nope": 15625, "billy": 15626, "digi": 15627, "musi": 15628, "rival": 15629, "figured": 15630, "tality": 15631, "sunny": 15632, "berk": 15633, "awww": 15634, "awaits": 15635, "unreal": 15636, "copen": 15637, "asylum": 15638, "exotic": 15639, "buen": 15640, "mock": 15641, "enable": 15642, "archy": 15643, "fra": 15644, "plastic": 15645, "almond": 15646, "ampli": 15647, "displays": 15648, "abbott": 15649, "sme": 15650, "xp": 15651, "ðŁĻĥ": 15652, "graphic": 15653, "ived": 15654, "mara": 15655, "caution": 15656, "leaks": 15657, "enberg": 15658, "ulu": 15659, "unicorn": 15660, "cannon": 15661, "apprentic": 15662, "ðŁĺĺðŁĺĺ": 15663, "bball": 15664, "willow": 15665, "atics": 15666, "amas": 15667, "manufacturer": 15668, "campaigns": 15669, "porters": 15670, "floors": 15671, "lsu": 15672, "type": 15673, "kej": 15674, "honorary": 15675, "itim": 15676, "tole": 15677, "minecraft": 15678, "dx": 15679, "mash": 15680, "rio": 15681, "consequences": 15682, "ronald": 15683, "gossi": 15684, "suffolk": 15685, "muse": 15686, "rbi": 15687, "livemusic": 15688, "ivan": 15689, "ðŁİ¤": 15690, "leu": 15691, "patriot": 15692, "manit": 15693, "lanca": 15694, "homedecor": 15695, "dear": 15696, "sigma": 15697, "tide": 15698, "strings": 15699, "vita": 15700, "sequel": 15701, "tryna": 15702, "investigate": 15703, "boris": 15704, "vegan": 15705, "barrier": 15706, "mindfulness": 15707, "webb": 15708, "hustle": 15709, "inda": 15710, "tanzania": 15711, "stray": 15712, "texas": 15713, "cag": 15714, "diagnosis": 15715, "woman": 15716, "gw": 15717, "obsession": 15718, "lative": 15719, "nufc": 15720, "flynn": 15721, "momentum": 15722, "sofa": 15723, "wald": 15724, "vegetable": 15725, "tucker": 15726, "supper": 15727, "seab": 15728, "arro": 15729, "seag": 15730, "venting": 15731, "councill": 15732, "splat": 15733, "calcul": 15734, "..#": 15735, "comfy": 15736, "odisha": 15737, "stopp": 15738, "warfare": 15739, "caes": 15740, "à¨": 15741, "coy": 15742, "priceless": 15743, "insec": 15744, "ðŁĺĽ": 15745, "controls": 15746, "empowerment": 15747, "datascience": 15748, "perpe": 15749, "genic": 15750, "eres": 15751, "trudeau": 15752, "mano": 15753, 
"slavery": 15754, "expanding": 15755, "mahe": 15756, "failing": 15757, "saga": 15758, "photographs": 15759, "crest": 15760, "reon": 15761, "surfing": 15762, "hie": 15763, "ðŁįĢ": 15764, "jae": 15765, "fellows": 15766, "southampton": 15767, "solom": 15768, "cester": 15769, "tability": 15770, "horn": 15771, "sect": 15772, "hee": 15773, "coleman": 15774, "atlas": 15775, "explorer": 15776, "consultation": 15777, "copyright": 15778, "organizing": 15779, "denied": 15780, "monkeys": 15781, "noodles": 15782, "bris": 15783, "flor": 15784, "dough": 15785, "bonds": 15786, "shocked": 15787, "ecosystem": 15788, "carefully": 15789, "wm": 15790, "apartments": 15791, "curve": 15792, "sandiego": 15793, "mustard": 15794, "commen": 15795, "ceremon": 15796, "ech": 15797, "ruth": 15798, "ðŁĻĮðŁı»": 15799, "hawai": 15800, "filmed": 15801, "tear": 15802, "asingly": 15803, "cair": 15804, "watt": 15805, "instrument": 15806, "outta": 15807, "yeol": 15808, "riverside": 15809, "ë°": 15810, ".:": 15811, "norwich": 15812, "alog": 15813, "migrants": 15814, "newman": 15815, "ride": 15816, "sprink": 15817, "targeting": 15818, "believe": 15819, "torch": 15820, "reflects": 15821, "permission": 15822, "ffman": 15823, "enemies": 15824, "basics": 15825, "seized": 15826, "sundays": 15827, "lei": 15828, "hassan": 15829, "endo": 15830, "hc": 15831, "stad": 15832, "lements": 15833, "kkkk": 15834, "nano": 15835, "shark": 15836, "mana": 15837, "onic": 15838, "treatments": 15839, "early": 15840, "collaborative": 15841, "shuttle": 15842, "branches": 15843, "misses": 15844, "mainedcm": 15845, "apers": 15846, "kyle": 15847, "carrie": 15848, "leisure": 15849, "shet": 15850, "birding": 15851, "advances": 15852, "ðŁĵĿ": 15853, "popular": 15854, "diane": 15855, "abe": 15856, "rewar": 15857, "neighbour": 15858, "kpop": 15859, "remembrance": 15860, "playground": 15861, "rub": 15862, "krishna": 15863, "ebola": 15864, "inquiry": 15865, "epa": 15866, "lumin": 15867, "organisation": 15868, "abraham": 15869, "normally": 15870, "preten": 15871, "janet": 15872, "wt": 15873, "ðŁĴİ": 15874, "encouraging": 15875, "astic": 15876, "bump": 15877, "sydney": 15878, "sz": 15879, "ssss": 15880, "garrett": 15881, "ðŁĵ»": 15882, "consulting": 15883, "romania": 15884, "spotting": 15885, "chancellor": 15886, "arma": 15887, "prestigious": 15888, "ðĿIJ": 15889, "tad": 15890, "cryst": 15891, "competit": 15892, "ratio": 15893, "cataly": 15894, "brow": 15895, "jur": 15896, "viking": 15897, "commute": 15898, "yday": 15899, "layers": 15900, "dumb": 15901, "escal": 15902, "genocide": 15903, "fill": 15904, "gupta": 15905, "stepping": 15906, "sei": 15907, "foto": 15908, "wildcats": 15909, "coli": 15910, "project": 15911, "earnings": 15912, "str": 15913, "geons": 15914, "completion": 15915, "bm": 15916, "decorated": 15917, "crawford": 15918, "afghan": 15919, "scare": 15920, "visibility": 15921, "hib": 15922, "direction": 15923, "stroll": 15924, "christina": 15925, "alternate": 15926, "clare": 15927, "stylist": 15928, "behold": 15929, "sance": 15930, "leopard": 15931, "acquired": 15932, "narrative": 15933, "ashi": 15934, "thea": 15935, "????": 15936, "peas": 15937, "atch": 15938, "slides": 15939, "leen": 15940, "renewable": 15941, "english": 15942, "quir": 15943, "coaster": 15944, "rx": 15945, "fools": 15946, "matchday": 15947, "mism": 15948, "amazing": 15949, "zig": 15950, "keting": 15951, "wont": 15952, "towel": 15953, "diab": 15954, "stake": 15955, "nm": 15956, "melt": 15957, "ethan": 15958, "grape": 15959, "politician": 15960, "smen": 15961, "íĺ": 15962, "reo": 15963, 
"weddings": 15964, "catcher": 15965, "oracle": 15966, "memo": 15967, "ðŁĮ´": 15968, "eck": 15969, "robbie": 15970, "norwegian": 15971, "operator": 15972, "amor": 15973, "sewing": 15974, "jul": 15975, "xie": 15976, "uv": 15977, "fifty": 15978, "mega": 15979, "tattoo": 15980, "liberals": 15981, "upri": 15982, "trafficking": 15983, "richardson": 15984, "suv": 15985, "kip": 15986, "messy": 15987, "tremendous": 15988, "glou": 15989, "courtney": 15990, "lad": 15991, "stereo": 15992, "myers": 15993, "idio": 15994, "^_^": 15995, "manning": 15996, "dye": 15997, "wd": 15998, "throne": 15999, "junk": 16000, "asu": 16001, "provincial": 16002, "kook": 16003, "wrc": 16004, "fineart": 16005, "hampshire": 16006, "renaissance": 16007, "bred": 16008, "fallout": 16009, "sj": 16010, "snl": 16011, "alam": 16012, "torture": 16013, "fyi": 16014, "shines": 16015, "paw": 16016, "char": 16017, "henry": 16018, "crow": 16019, "acious": 16020, "dian": 16021, "paige": 16022, "bare": 16023, "stockholm": 16024, "scenery": 16025, "ðŁĩ·": 16026, "jeffrey": 16027, "push": 16028, "decoration": 16029, "ned": 16030, "cute": 16031, "brigade": 16032, "lavender": 16033, "invites": 16034, "esports": 16035, "voir": 16036, "dried": 16037, "transpl": 16038, "surgeon": 16039, "novels": 16040, "pulls": 16041, "sony": 16042, "lunar": 16043, "mane": 16044, "ivy": 16045, "frustr": 16046, "dorset": 16047, "sai": 16048, "torres": 16049, "ssion": 16050, "shutdown": 16051, "suggestions": 16052, "writing": 16053, "eo": 16054, "battlefield": 16055, "uga": 16056, "ðŁIJ¾": 16057, "vacu": 16058, "splac": 16059, "git": 16060, "ug": 16061, "highland": 16062, "%)": 16063, "mermaid": 16064, "sacramento": 16065, "tails": 16066, "pw": 16067, "kah": 16068, "tell": 16069, "enhanced": 16070, "ìķ": 16071, "auckland": 16072, "cruel": 16073, "ðŁ¤©": 16074, "audre": 16075, "sailor": 16076, "grammar": 16077, "glove": 16078, "deon": 16079, "inflam": 16080, "freshly": 16081, "kell": 16082, "zip": 16083, "christie": 16084, "mild": 16085, "dixon": 16086, "instructor": 16087, "gence": 16088, "ãħł": 16089, "subjec": 16090, "constitutional": 16091, "crowds": 16092, "invisible": 16093, "ruins": 16094, "dak": 16095, "sip": 16096, "plaque": 16097, "pouring": 16098, "complex": 16099, "zine": 16100, "stead": 16101, "flet": 16102, "transmission": 16103, "loway": 16104, "arun": 16105, "increasingly": 16106, "aud": 16107, "transparen": 16108, "crowned": 16109, "scoun": 16110, "blizzard": 16111, "luxu": 16112, "fiers": 16113, "achievements": 16114, "hunters": 16115, "rocked": 16116, "basin": 16117, "violet": 16118, "proves": 16119, "achieving": 16120, "prosper": 16121, "sega": 16122, "float": 16123, "vian": 16124, "xiv": 16125, "polic": 16126, "tura": 16127, "approximately": 16128, "wanderlust": 16129, "keepers": 16130, "getaway": 16131, "cod": 16132, "polis": 16133, "bryan": 16134, "colts": 16135, "talents": 16136, "yogur": 16137, "glutenfree": 16138, "wrist": 16139, "gry": 16140, "czech": 16141, "ðŁİĪ": 16142, "eville": 16143, "ðŁıĪ": 16144, "tox": 16145, "daniels": 16146, "amer": 16147, "bids": 16148, "weareone": 16149, "metab": 16150, "gt": 16151, "boyz": 16152, "pdx": 16153, "possession": 16154, "pushed": 16155, "shrine": 16156, "realistic": 16157, "trigger": 16158, "navi": 16159, "rumors": 16160, "naf": 16161, "jenkins": 16162, "trun": 16163, "communi": 16164, "ÃĹ": 16165, "gamers": 16166, "armor": 16167, "mohammed": 16168, "balcony": 16169, "yah": 16170, "strongest": 16171, "rhythm": 16172, "unforgettable": 16173, "kp": 16174, "hobb": 16175, "custody": 16176, "gregor": 
16177, "rita": 16178, "aesthetic": 16179, "ilation": 16180, "sponsoring": 16181, "nay": 16182, "kidnapp": 16183, "shs": 16184, "rajas": 16185, "meg": 16186, "significantly": 16187, "buttons": 16188, "lac": 16189, "versions": 16190, "essentials": 16191, "opinions": 16192, "kro": 16193, "dprinting": 16194, "widely": 16195, "dk": 16196, "uran": 16197, "yal": 16198, "requested": 16199, "cn": 16200, "curric": 16201, "plum": 16202, "grun": 16203, "vm": 16204, "devon": 16205, "myo": 16206, "relation": 16207, "juventus": 16208, "rouge": 16209, "minority": 16210, "mines": 16211, "jupiter": 16212, "nine": 16213, "oxygen": 16214, "frankie": 16215, "unesco": 16216, "fabric": 16217, "disgusting": 16218, "salman": 16219, "detection": 16220, "lanka": 16221, "dac": 16222, "ðŁĩ«ðŁĩ·": 16223, "argument": 16224, "shelves": 16225, "celtics": 16226, "roberto": 16227, "pigs": 16228, "hedge": 16229, "faul": 16230, "powering": 16231, "butterflies": 16232, "fir": 16233, "remake": 16234, "atti": 16235, "como": 16236, "empha": 16237, "kendall": 16238, "pokemon": 16239, "seating": 16240, "dans": 16241, "baldwin": 16242, "ðŁij»": 16243, "leslie": 16244, "onedirection": 16245, "timber": 16246, "iman": 16247, "font": 16248, "eder": 16249, "dion": 16250, "steph": 16251, "format": 16252, "gregory": 16253, "prop": 16254, "hex": 16255, "ruin": 16256, "sory": 16257, "infer": 16258, "naw": 16259, "barak": 16260, "sdgs": 16261, "karao": 16262, "lush": 16263, "vander": 16264, "endent": 16265, "gis": 16266, "afro": 16267, "soccer": 16268, "ayan": 16269, "tuni": 16270, "lung": 16271, "dayof": 16272, "alexa": 16273, "marath": 16274, "addicted": 16275, "agile": 16276, "hygi": 16277, "lightweight": 16278, "ì§": 16279, "mandela": 16280, "joey": 16281, "ancy": 16282, "hum": 16283, "bir": 16284, "memorial": 16285, "jimin": 16286, "ginger": 16287, "vak": 16288, "javascri": 16289, "crops": 16290, "origins": 16291, "dari": 16292, "piper": 16293, "import": 16294, "aggressive": 16295, "prediction": 16296, "repairs": 16297, "cracker": 16298, "voyage": 16299, "nike": 16300, "mummy": 16301, "linkedin": 16302, "countryside": 16303, "border": 16304, "glass": 16305, "pert": 16306, "sals": 16307, "shoe": 16308, "autographed": 16309, "walnut": 16310, "collegi": 16311, "salary": 16312, "pairing": 16313, "ðŁĮ¸": 16314, "cathol": 16315, "sweethe": 16316, "defeats": 16317, "strengthen": 16318, "rooftop": 16319, "improvements": 16320, "barriers": 16321, "uru": 16322, "tally": 16323, "ruled": 16324, "ðŁĨļ": 16325, "naija": 16326, "emoji": 16327, "percent": 16328, "gio": 16329, "probs": 16330, "once": 16331, "admits": 16332, "paths": 16333, "liar": 16334, "daytona": 16335, "peters": 16336, "cali": 16337, "calli": 16338, "mug": 16339, "osa": 16340, "aph": 16341, "aby": 16342, "hyde": 16343, "ethnic": 16344, "plains": 16345, "olf": 16346, "hahahahaha": 16347, "holic": 16348, "?!?!": 16349, "subli": 16350, "blacks": 16351, "mot": 16352, "ghton": 16353, "lovin": 16354, "brent": 16355, "baru": 16356, "lati": 16357, "dew": 16358, "ateau": 16359, "qa": 16360, "painful": 16361, "busters": 16362, "static": 16363, "ðŁĩ¨ðŁĩ¦": 16364, "notebook": 16365, "outfits": 16366, "sies": 16367, "rf": 16368, "floods": 16369, "ÑĢ": 16370, "throat": 16371, "suici": 16372, "rovers": 16373, "bengal": 16374, "prepares": 16375, "blog": 16376, "miniature": 16377, "ب": 16378, "amphi": 16379, "comb": 16380, "rsp": 16381, "intimate": 16382, "greene": 16383, "Ìĩ": 16384, "altar": 16385, "surgical": 16386, "vessel": 16387, "...?": 16388, "gavin": 16389, "gator": 16390, "threatened": 
16391, "zar": 16392, "robbery": 16393, "dier": 16394, "promoted": 16395, "yg": 16396, "xs": 16397, "subs": 16398, "interviewing": 16399, "threatening": 16400, "dozen": 16401, "meado": 16402, "waterfall": 16403, "nintendoswitch": 16404, "calum": 16405, "ministers": 16406, "drop": 16407, "universities": 16408, "warned": 16409, "tactics": 16410, "ðŁĩ²": 16411, "refuse": 16412, "adju": 16413, "vast": 16414, "ðŁĺ´": 16415, "mcfc": 16416, "libya": 16417, "nofilter": 16418, "distributed": 16419, "reser": 16420, "ronnie": 16421, "deco": 16422, "javascript": 16423, "monk": 16424, "interests": 16425, "flex": 16426, "martha": 16427, "sties": 16428, "ood": 16429, "ðŁ¤£ðŁ¤£": 16430, "eun": 16431, "bali": 16432, "gomez": 16433, "stimul": 16434, "moderate": 16435, "dity": 16436, "iris": 16437, "straw": 16438, "consistent": 16439, "directions": 16440, "adopt": 16441, "salsa": 16442, "croo": 16443, "recovered": 16444, "blackfriday": 16445, "lancaster": 16446, "accept": 16447, "weareoneexo": 16448, "builds": 16449, "freeman": 16450, "airplane": 16451, "dition": 16452, "belong": 16453, "jamie": 16454, "pitching": 16455, "lif": 16456, "omin": 16457, "crispy": 16458, "prepping": 16459, "veg": 16460, "chang": 16461, "accomplished": 16462, "gracias": 16463, "dolphin": 16464, "elector": 16465, "culinary": 16466, "superbowl": 16467, "wala": 16468, "pursuit": 16469, "blackberry": 16470, "bean": 16471, "cardinal": 16472, "proved": 16473, "immigrant": 16474, "strictly": 16475, "holocaust": 16476, "passage": 16477, "haus": 16478, "coup": 16479, "purse": 16480, "harass": 16481, "<<": 16482, "leed": 16483, "adobe": 16484, "stad": 16485, "legislat": 16486, "parked": 16487, "priyan": 16488, "silva": 16489, "krist": 16490, "sthe": 16491, "funky": 16492, "iga": 16493, "settlement": 16494, "phs": 16495, "tmrw": 16496, "stressed": 16497, "hunt": 16498, "hockey": 16499, "treasures": 16500, "chambers": 16501, "olu": 16502, "hut": 16503, "marley": 16504, "texture": 16505, "wilderness": 16506, "mming": 16507, "potentially": 16508, "omaha": 16509, "judy": 16510, "toes": 16511, "spoiler": 16512, "distinguished": 16513, "felix": 16514, "ahu": 16515, "recommendations": 16516, "zombies": 16517, "hitler": 16518, "triple": 16519, "collapse": 16520, "motivated": 16521, "ultimat": 16522, "ggling": 16523, "soy": 16524, "cigar": 16525, "foren": 16526, "vineyard": 16527, "glitter": 16528, "findings": 16529, "colonial": 16530, "hunter": 16531, "erik": 16532, "dens": 16533, "beetle": 16534, "lotte": 16535, "subtle": 16536, "smatter": 16537, "trusted": 16538, "experimental": 16539, "naments": 16540, "ðŁĺĨ": 16541, "region": 16542, "acquisition": 16543, "breeding": 16544, "quarterback": 16545, "amreading": 16546, "ootd": 16547, "rude": 16548, "initiatives": 16549, "stout": 16550, "hyung": 16551, "outcome": 16552, "alfred": 16553, "mics": 16554, "expertise": 16555, "bacteria": 16556, "penguins": 16557, "jumper": 16558, "valencia": 16559, "bark": 16560, "ingday": 16561, "sellers": 16562, "contracts": 16563, "houston": 16564, "commissioned": 16565, "adaptation": 16566, "swansea": 16567, "santiago": 16568, "commonwealth": 16569, "judging": 16570, "submission": 16571, "scorer": 16572, "tommy": 16573, "ño": 16574, "exquis": 16575, "filing": 16576, "explanation": 16577, "allison": 16578, "wembley": 16579, "ridge": 16580, "chevy": 16581, "santos": 16582, "ownership": 16583, "cognitive": 16584, "favourites": 16585, "shed": 16586, "philanthro": 16587, "deleted": 16588, "godd": 16589, "snor": 16590, "guidelines": 16591, "ffing": 16592, "jeep": 16593, 
"clips": 16594, "swamp": 16595, "anor": 16596, "guild": 16597, "bolton": 16598, "springfield": 16599, "municipal": 16600, "goalkeeper": 16601, "yeon": 16602, "ðŁĺįðŁĺįðŁĺįðŁĺį": 16603, "ãħĭãħĭ": 16604, "waterfront": 16605, "grave": 16606, "contemporary": 16607, "arity": 16608, "ÃŃa": 16609, "sleeps": 16610, "syrup": 16611, "alam": 16612, "pire": 16613, "coyo": 16614, "motogp": 16615, "tyson": 16616, "kejri": 16617, "circul": 16618, "singly": 16619, "crunch": 16620, "complicated": 16621, "nostalgia": 16622, "kop": 16623, "move": 16624, "kale": 16625, "macro": 16626, "midwest": 16627, "hans": 16628, "tribal": 16629, "nude": 16630, "à¯į": 16631, "beyonce": 16632, "congratulate": 16633, "cater": 16634, "league": 16635, "ðŁĻĬ": 16636, "ladder": 16637, "crashed": 16638, "technic": 16639, "karaoke": 16640, "harassment": 16641, "rots": 16642, "experiencing": 16643, "kristen": 16644, "ðŁĩ³": 16645, "ðŁ¤Ĺ": 16646, "reflections": 16647, "guinness": 16648, "illustrator": 16649, "ðŁĻıðŁı»": 16650, "center": 16651, "narrow": 16652, "commons": 16653, "regulations": 16654, "ÙĨ": 16655, "harm": 16656, "croft": 16657, "cussion": 16658, "hongkong": 16659, "stical": 16660, "internship": 16661, "zoe": 16662, "chop": 16663, "hoods": 16664, "estimated": 16665, "batteries": 16666, "berkeley": 16667, "smoothie": 16668, "shaun": 16669, "cros": 16670, "~~": 16671, "campe": 16672, "hump": 16673, "bg": 16674, "prototype": 16675, "click": 16676, "shawn": 16677, "reviewed": 16678, "templ": 16679, "pf": 16680, "jedi": 16681, "blogs": 16682, "raymond": 16683, "asth": 16684, "bah": 16685, "avail": 16686, "scotch": 16687, "leafs": 16688, "nikki": 16689, "tok": 16690, "hollow": 16691, "urges": 16692, "oft": 16693, "unlike": 16694, "latin": 16695, "ue": 16696, "catering": 16697, "mili": 16698, "alternati": 16699, "maver": 16700, "и": 16701, "agle": 16702, "preorder": 16703, "lux": 16704, "cucu": 16705, "ðŁijıðŁijı": 16706, "tart": 16707, "âĿ¤âĿ¤âĿ¤": 16708, "arabic": 16709, "rapidly": 16710, "arrang": 16711, "allen": 16712, "traveltuesday": 16713, "paws": 16714, "flows": 16715, "stability": 16716, "fluid": 16717, "capp": 16718, "canberra": 16719, "uuuu": 16720, "spani": 16721, "demonstration": 16722, "mla": 16723, "placement": 16724, "mw": 16725, "presidents": 16726, "awesom": 16727, "beverly": 16728, "anist": 16729, "neal": 16730, "fathersday": 16731, "referendum": 16732, "lahore": 16733, "oaks": 16734, "debbie": 16735, "halfway": 16736, "ghosts": 16737, "debor": 16738, "matthews": 16739, "fiat": 16740, "tfw": 16741, "presen": 16742, "robi": 16743, "ded": 16744, "brock": 16745, "laughed": 16746, "amounts": 16747, "bamboo": 16748, "kindergarten": 16749, "eaten": 16750, "mtvhottest": 16751, "breakout": 16752, "usic": 16753, "fraser": 16754, "legislative": 16755, "pang": 16756, "module": 16757, "sammy": 16758, "gover": 16759, "earns": 16760, "expedition": 16761, "garh": 16762, "concepts": 16763, "charlie": 16764, "lava": 16765, "bachelor": 16766, "veggies": 16767, "determine": 16768, "ellie": 16769, "unlocked": 16770, "fruit": 16771, "dalla": 16772, "coupe": 16773, "washington": 16774, "deposit": 16775, "ivory": 16776, "paula": 16777, "chicag": 16778, "gucci": 16779, "ðŁİĥ": 16780, "cultiv": 16781, "pierce": 16782, "lifted": 16783, "stumb": 16784, "recover": 16785, "muscles": 16786, "conducting": 16787, "cbs": 16788, "mclaren": 16789, "sophia": 16790, "cellu": 16791, "oceans": 16792, "uploaded": 16793, "gameplay": 16794, "maldives": 16795, "kimber": 16796, "avoi": 16797, "racer": 16798, "caine": 16799, "cavs": 16800, "hana": 
16801, "liga": 16802, "raven": 16803, "intervention": 16804, "inauguration": 16805, "ooh": 16806, "attraction": 16807, "merchandise": 16808, "tunein": 16809, "liking": 16810, "juniors": 16811, "intended": 16812, "attacking": 16813, "aquarium": 16814, "iwd": 16815, "components": 16816, "suring": 16817, "centu": 16818, "yogurt": 16819, "ðŁıĥ": 16820, "showroom": 16821, "optical": 16822, "tyour": 16823, "judge": 16824, "yield": 16825, "anto": 16826, "plc": 16827, "transparency": 16828, "recycled": 16829, "chief": 16830, "arom": 16831, "ambassadors": 16832, "planet": 16833, "âĿĦï¸ı": 16834, "omed": 16835, "vanessa": 16836, "court": 16837, "margar": 16838, "haley": 16839, "vr": 16840, "regina": 16841, "pdates": 16842, "hispan": 16843, "livestream": 16844, "âģ£": 16845, "yahoo": 16846, "galla": 16847, "secured": 16848, "wir": 16849, "beneath": 16850, "offl": 16851, "nil": 16852, "amb": 16853, "yeg": 16854, "outlet": 16855, "ute": 16856, "peep": 16857, "lindsay": 16858, "bentley": 16859, "...!": 16860, "heel": 16861, "trilogy": 16862, "vos": 16863, "tyre": 16864, "therefore": 16865, "toronto": 16866, "abi": 16867, "simpli": 16868, "jae": 16869, "extensive": 16870, "elephants": 16871, "sor": 16872, "orientation": 16873, "impeach": 16874, "replay": 16875, "constructed": 16876, "peterson": 16877, "pais": 16878, "ported": 16879, "customs": 16880, "collap": 16881, "adu": 16882, "highlands": 16883, "salem": 16884, "shelby": 16885, "kovic": 16886, "strain": 16887, "rosie": 16888, "senators": 16889, "snaps": 16890, "bobb": 16891, "suzuki": 16892, "blades": 16893, "kp": 16894, "lolo": 16895, "generate": 16896, "sight": 16897, "mae": 16898, "structural": 16899, "predict": 16900, "jumped": 16901, "ahmad": 16902, "sung": 16903, "justice": 16904, "glam": 16905, "volvo": 16906, "jubilee": 16907, "detention": 16908, "losses": 16909, "puri": 16910, "everytime": 16911, "а": 16912, "rao": 16913, "edge": 16914, "limer": 16915, "resemb": 16916, "harold": 16917, "retri": 16918, "sacrific": 16919, "surprises": 16920, "amc": 16921, "srilanka": 16922, "barbie": 16923, "mens": 16924, "finn": 16925, "ags": 16926, "ukrainian": 16927, "embrac": 16928, "îIJ": 16929, "flavors": 16930, "homer": 16931, "laure": 16932, "outh": 16933, "priced": 16934, "verde": 16935, "firm": 16936, "ahs": 16937, "cub": 16938, "trey": 16939, "paranor": 16940, "profit": 16941, "indv": 16942, "whoa": 16943, "harsh": 16944, "alot": 16945, "critics": 16946, "hubby": 16947, "figur": 16948, "gira": 16949, "castro": 16950, "chanel": 16951, "input": 16952, "originals": 16953, "tenant": 16954, "yyyy": 16955, "turers": 16956, "lincoln": 16957, "coon": 16958, "learn": 16959, "chou": 16960, "acare": 16961, "oles": 16962, "diner": 16963, "hyp": 16964, "bizarre": 16965, "mcr": 16966, "letsgo": 16967, "decorating": 16968, "ðŁĮİ": 16969, "alison": 16970, "arvin": 16971, "fd": 16972, "rehab": 16973, "mccarthy": 16974, "lottery": 16975, "dah": 16976, "minneapolis": 16977, "eligible": 16978, "diagnosed": 16979, "emerald": 16980, "destinations": 16981, "sans": 16982, "ory": 16983, "blazers": 16984, "nv": 16985, "bail": 16986, "digitalart": 16987, "noc": 16988, "malta": 16989, "solar": 16990, "pipes": 16991, "allegations": 16992, "nock": 16993, "pope": 16994, "brid": 16995, "premier": 16996, "nx": 16997, "presentations": 16998, "efa": 16999, "bows": 17000, "valve": 17001, "opponent": 17002, "Įë": 17003, "visual": 17004, "ingle": 17005, "categor": 17006, "eter": 17007, "pois": 17008, "dani": 17009, "attract": 17010, "neutral": 17011, "thene": 17012, "crashes": 17013, 
"freddie": 17014, "utili": 17015, "cst": 17016, "awakening": 17017, "sloven": 17018, "qualify": 17019, "proof": 17020, "fairy": 17021, "lev": 17022, "freight": 17023, "enjoys": 17024, "cupcake": 17025, "flavour": 17026, "âķ": 17027, "protective": 17028, "ðŁijıðŁı»": 17029, "isu": 17030, "admir": 17031, "hmmm": 17032, "continuous": 17033, "aires": 17034, "raptors": 17035, "showcasing": 17036, "yuk": 17037, "paste": 17038, "follower": 17039, "instructions": 17040, "spru": 17041, "@__": 17042, "theo": 17043, "debuts": 17044, "vette": 17045, "stow": 17046, "esof": 17047, "ached": 17048, "sultan": 17049, "sandwich": 17050, "somalia": 17051, "franco": 17052, "carne": 17053, "fluffy": 17054, "alpine": 17055, "jasmine": 17056, "heated": 17057, "violin": 17058, "pless": 17059, "divorce": 17060, "performer": 17061, "phies": 17062, "portsm": 17063, "dara": 17064, "kirby": 17065, "lop": 17066, "chilli": 17067, "forth": 17068, "skype": 17069, "ðŁĩ®ðŁĩ¹": 17070, "celebrities": 17071, "edy": 17072, "vee": 17073, "poison": 17074, "eyel": 17075, "grabs": 17076, "ssic": 17077, "uno": 17078, "western": 17079, "railroad": 17080, "amer": 17081, "numerous": 17082, "sv": 17083, "fow": 17084, "fist": 17085, "âĢĭ": 17086, "requests": 17087, "martial": 17088, "emmy": 17089, "acceptance": 17090, "laura": 17091, "ิ": 17092, "erup": 17093, "hyundai": 17094, "outlander": 17095, "utt": 17096, "wrestle": 17097, "espresso": 17098, "demanding": 17099, "gdp": 17100, "geography": 17101, "saskat": 17102, "troll": 17103, "confeder": 17104, "sues": 17105, "sem": 17106, "bets": 17107, "tful": 17108, "tosh": 17109, "teaches": 17110, "coloured": 17111, "galway": 17112, "macy": 17113, "disorders": 17114, "bbcra": 17115, "atem": 17116, "fender": 17117, "litter": 17118, "esh": 17119, "providers": 17120, "renovation": 17121, "nominate": 17122, "psg": 17123, "nominations": 17124, "jenna": 17125, "sharp": 17126, "someday": 17127, "zur": 17128, "brains": 17129, "cheshire": 17130, "prey": 17131, "hugo": 17132, "¿": 17133, "token": 17134, "rv": 17135, "carr": 17136, "tactical": 17137, "zelda": 17138, "kayla": 17139, "fernando": 17140, "photographers": 17141, "jour": 17142, "umbrella": 17143, "woody": 17144, "congressman": 17145, "dump": 17146, "levy": 17147, "juan": 17148, "dazz": 17149, "signals": 17150, "lain": 17151, "anu": 17152, "michel": 17153, "porch": 17154, "alden": 17155, "siblings": 17156, "yale": 17157, "peel": 17158, "swick": 17159, "ggin": 17160, "llc": 17161, "kale": 17162, "scon": 17163, "ild": 17164, "patreon": 17165, "reel": 17166, "quin": 17167, "witt": 17168, "marty": 17169, "moody": 17170, "toni": 17171, "dery": 17172, "gators": 17173, "specifically": 17174, "ddin": 17175, "lyon": 17176, "trick": 17177, "meadows": 17178, "pj": 17179, "borgh": 17180, "vik": 17181, "tur": 17182, "bronx": 17183, "puff": 17184, "lantern": 17185, "ðŁ¤¦": 17186, "gently": 17187, "bestie": 17188, "fact": 17189, "refused": 17190, "fasci": 17191, "mpy": 17192, "ðŁĶµ": 17193, "crossover": 17194, "meadow": 17195, "indianapolis": 17196, "ducation": 17197, "sley": 17198, "loom": 17199, "mixer": 17200, "newmusic": 17201, "filmmaker": 17202, "prosperity": 17203, "lim": 17204, "weekend": 17205, "creamy": 17206, "neutr": 17207, "luther": 17208, "hv": 17209, "northern": 17210, "two": 17211, "hra": 17212, "catches": 17213, "appearances": 17214, "habit": 17215, "kittens": 17216, "nv": 17217, "illac": 17218, "infan": 17219, "regardless": 17220, "lizard": 17221, "dunk": 17222, "curtain": 17223, "acom": 17224, "intu": 17225, "vez": 17226, "emin": 17227, 
"flats": 17228, "calendars": 17229, "empower": 17230, "ruined": 17231, "hungary": 17232, "vid": 17233, "wex": 17234, "ulum": 17235, "aberdeen": 17236, "osa": 17237, "kt": 17238, "massi": 17239, "seemed": 17240, "sden": 17241, "'?": 17242, "telephone": 17243, "defi": 17244, "inspires": 17245, "meow": 17246, "zones": 17247, "blind": 17248, "ply": 17249, "tucson": 17250, "adventure": 17251, "ged": 17252, "oyster": 17253, "ðŁijıðŁijıðŁijı": 17254, "output": 17255, "ttt": 17256, "metallic": 17257, "smash": 17258, "ucla": 17259, "scots": 17260, "perfect": 17261, "lucy": 17262, "regularly": 17263, "spic": 17264, "relative": 17265, "athers": 17266, "mise": 17267, "battling": 17268, "decides": 17269, "mata": 17270, "occupied": 17271, "randomly": 17272, "catsoftwitter": 17273, "gian": 17274, "bally": 17275, "alties": 17276, "allies": 17277, "immen": 17278, "syrac": 17279, "ðŁĴľðŁĴľ": 17280, "llan": 17281, "aur": 17282, "kut": 17283, "lamar": 17284, "affects": 17285, "nra": 17286, "starwar": 17287, "ðŁ¤ĺ": 17288, "scram": 17289, "enchan": 17290, "process": 17291, "luxurious": 17292, "array": 17293, "sherlock": 17294, "compati": 17295, "dorf": 17296, "stress": 17297, "msu": 17298, "swith": 17299, "sala": 17300, "sofinstagram": 17301, "foil": 17302, "understood": 17303, "quay": 17304, "rp": 17305, "cade": 17306, "jaw": 17307, "enab": 17308, "encoun": 17309, "ðŁİī:": 17310, "dock": 17311, "saturn": 17312, "mull": 17313, "layout": 17314, "rarely": 17315, "happily": 17316, "fixture": 17317, "orph": 17318, "overlooking": 17319, "herbs": 17320, "mitt": 17321, "pillar": 17322, "nolan": 17323, "petty": 17324, "stry": 17325, "ui": 17326, "muk": 17327, "ores": 17328, "overs": 17329, "áµ": 17330, "recreation": 17331, "wesley": 17332, "rit": 17333, "kejriwal": 17334, "stocking": 17335, "gv": 17336, "subscribers": 17337, "moose": 17338, "mae": 17339, "bert": 17340, "oppre": 17341, "assignment": 17342, "uro": 17343, "highlighting": 17344, "calvin": 17345, "weigh": 17346, "cambodia": 17347, "avon": 17348, "kem": 17349, "disabilities": 17350, "ready": 17351, "chargers": 17352, "pads": 17353, "izing": 17354, "illian": 17355, "truste": 17356, "colleges": 17357, "associates": 17358, "albany": 17359, "milton": 17360, "cron": 17361, "bur": 17362, "hardly": 17363, "sights": 17364, "antiques": 17365, "echo": 17366, "surprisingly": 17367, "haiti": 17368, "capt": 17369, "php": 17370, "opio": 17371, "inequality": 17372, "equal": 17373, "keny": 17374, "schmid": 17375, "autographs": 17376, "rent": 17377, "quer": 17378, "citrus": 17379, "challenged": 17380, "tec": 17381, "epide": 17382, "fest": 17383, "zhou": 17384, "lime": 17385, "citizenship": 17386, "crystal": 17387, "convinced": 17388, "messenger": 17389, "copenhagen": 17390, "âĿĹï¸ı": 17391, "warran": 17392, "developments": 17393, "ï¸ıâĥ£": 17394, "forex": 17395, "hiro": 17396, "sneakers": 17397, "xide": 17398, "viva": 17399, "stereo": 17400, "batting": 17401, "ssel": 17402, "host": 17403, "bengal": 17404, "criticism": 17405, "qc": 17406, "crun": 17407, "attempted": 17408, "rye": 17409, "determination": 17410, "creations": 17411, "dread": 17412, "labels": 17413, "posse": 17414, "ancer": 17415, "johan": 17416, "sister": 17417, "partnerships": 17418, "lesbian": 17419, "kst": 17420, "guarantee": 17421, "baro": 17422, "fixing": 17423, "mason": 17424, "mous": 17425, "chemicals": 17426, "tless": 17427, "biodiversity": 17428, "paro": 17429, "bharat": 17430, "acol": 17431, "refuge": 17432, "ente": 17433, "titi": 17434, "dyssey": 17435, "responds": 17436, "lefto": 17437, "iner": 
17438, "sevel": 17439, "rahul": 17440, "oline": 17441, "frankfur": 17442, "choreo": 17443, "enjoyable": 17444, "cto": 17445, "struggles": 17446, "woodland": 17447, "heavyweight": 17448, "gens": 17449, "recep": 17450, "accred": 17451, "ðŁĺ¡": 17452, "transformed": 17453, "listen": 17454, "atop": 17455, "nk": 17456, "surge": 17457, "bere": 17458, "governor": 17459, "prisoners": 17460, "claude": 17461, "till": 17462, "mulator": 17463, "emotion": 17464, "waterloo": 17465, "start": 17466, "ðŁĩº": 17467, "cleaned": 17468, "grandmother": 17469, "fearless": 17470, "african": 17471, "astronomy": 17472, "ðŁıģ": 17473, "à¸Ļ": 17474, "theworld": 17475, "suitable": 17476, "anthony": 17477, "kand": 17478, "tten": 17479, "meaningful": 17480, "disclo": 17481, "jacobs": 17482, "ø": 17483, "tomlinson": 17484, "ghetti": 17485, "typho": 17486, "substan": 17487, "asco": 17488, "tek": 17489, "nagar": 17490, "mud": 17491, "amon": 17492, "vaccine": 17493, "fty": 17494, "flesh": 17495, "noel": 17496, "inflation": 17497, "portugue": 17498, "glamour": 17499, "tram": 17500, "vre": 17501, "tequ": 17502, "roundup": 17503, "wyn": 17504, "rejected": 17505, "mosaic": 17506, "sighting": 17507, "calf": 17508, "ota": 17509, "composition": 17510, "gopro": 17511, "gonzale": 17512, "eed": 17513, "bard": 17514, "tue": 17515, "effectively": 17516, "ween": 17517, "alto": 17518, "ribs": 17519, "relate": 17520, "thirsty": 17521, "furious": 17522, "dim": 17523, "chard": 17524, "perfume": 17525, "sny": 17526, "churchill": 17527, "kof": 17528, "masterclass": 17529, "wave": 17530, "ðŁĶµ": 17531, "erin": 17532, "owns": 17533, "tobe": 17534, "skilled": 17535, "tem": 17536, "gof": 17537, "eni": 17538, "tori": 17539, "crazy": 17540, "lick": 17541, "resistant": 17542, "icial": 17543, "agar": 17544, "!:": 17545, "gali": 17546, "delaware": 17547, "blitz": 17548, "kohli": 17549, "puck": 17550, "availability": 17551, "himalay": 17552, "influential": 17553, "crochet": 17554, "victori": 17555, "reading": 17556, "hobby": 17557, "viet": 17558, "jas": 17559, "engra": 17560, "skul": 17561, "ðŁĩ²ðŁĩ": 17562, "educate": 17563, "techno": 17564, "districts": 17565, "blues": 17566, "sett": 17567, "seventh": 17568, "learns": 17569, "eeee": 17570, "apocalypse": 17571, "hangout": 17572, "cruel": 17573, "mutu": 17574, "bruh": 17575, "helen": 17576, "sheer": 17577, "ction": 17578, "klein": 17579, "texans": 17580, "cereal": 17581, "shine": 17582, "nered": 17583, "gras": 17584, "ambro": 17585, "fella": 17586, "hindu": 17587, "matthew": 17588, "lima": 17589, "miranda": 17590, "jewel": 17591, "soho": 17592, "eurovision": 17593, "neighbours": 17594, "chandler": 17595, "besides": 17596, "ðŁ¥°": 17597, "astros": 17598, "thumbs": 17599, "renault": 17600, "rave": 17601, "hired": 17602, "ðŁĸ¤": 17603, "itary": 17604, "zor": 17605, "blazer": 17606, "kine": 17607, "eau": 17608, "katy": 17609, "dccomics": 17610, "pec": 17611, "rodgers": 17612, "waterproof": 17613, "killers": 17614, "superint": 17615, "preserv": 17616, "asso": 17617, "brewers": 17618, "promotional": 17619, "scam": 17620, "villages": 17621, "sketches": 17622, "juicy": 17623, "forlife": 17624, "audit": 17625, "solo": 17626, "fundamental": 17627, "lene": 17628, "philippine": 17629, "tend": 17630, "conservatives": 17631, "sponsorship": 17632, "ddle": 17633, "aine": 17634, "htc": 17635, "osi": 17636, "hulk": 17637, "waf": 17638, "à¸Ļ": 17639, "evaluation": 17640, "antine": 17641, "slee": 17642, "robertson": 17643, "roosevel": 17644, "agi": 17645, "sophistic": 17646, "employers": 17647, "bubbles": 17648, "kowski": 
17649, "interaction": 17650, "shu": 17651, "boule": 17652, "ican": 17653, "jare": 17654, "hank": 17655, "legitim": 17656, "knicks": 17657, "karma": 17658, "receiver": 17659, "perks": 17660, "uh": 17661, "stair": 17662, "suni": 17663, "laboratory": 17664, "graves": 17665, "vocals": 17666, "oot": 17667, "cture": 17668, "thrive": 17669, "tico": 17670, "ãĥ³": 17671, "bw": 17672, "cartoons": 17673, "mcdonalds": 17674, "draw": 17675, "yung": 17676, "pler": 17677, "lid": 17678, "ethical": 17679, "groove": 17680, "enta": 17681, "internationalwomensday": 17682, "patron": 17683, "worries": 17684, "ðŁİħ": 17685, "ðŁijĭ": 17686, "katherine": 17687, "diaz": 17688, "tori": 17689, "bachchan": 17690, "trust": 17691, "mineral": 17692, "icom": 17693, "builders": 17694, "born": 17695, "coloring": 17696, "latte": 17697, "case": 17698, "revolution": 17699, "trader": 17700, "oxid": 17701, "chipot": 17702, "instantly": 17703, "southern": 17704, "sehun": 17705, "prob": 17706, "hernandez": 17707, "lisbon": 17708, "huawe": 17709, "pong": 17710, "mea": 17711, "rooney": 17712, "wheelchair": 17713, "keen": 17714, "bett": 17715, "corin": 17716, "regulatory": 17717, "displac": 17718, "karen": 17719, "schem": 17720, "sunsets": 17721, "whales": 17722, "reminis": 17723, "hep": 17724, "hide": 17725, "marcel": 17726, "pandora": 17727, "doyle": 17728, "thfc": 17729, "otto": 17730, "nokia": 17731, "transgender": 17732, "kov": 17733, "hawaiian": 17734, "shave": 17735, "sovere": 17736, "excer": 17737, "nicki": 17738, "pug": 17739, "stor": 17740, "roth": 17741, "weet": 17742, "legal": 17743, "dignity": 17744, "pow": 17745, "homage": 17746, "ðŁĩ³ðŁĩ": 17747, "sre": 17748, "canon": 17749, "lax": 17750, "woah": 17751, "quartz": 17752, "ña": 17753, "greeting": 17754, "flickr": 17755, "nairobi": 17756, "advocates": 17757, "anc": 17758, "vii": 17759, "eugene": 17760, "thra": 17761, "cre": 17762, "elan": 17763, "pension": 17764, "thletics": 17765, "toni": 17766, "reagan": 17767, "xv": 17768, "store": 17769, "bench": 17770, "harlem": 17771, "toddler": 17772, "sentenced": 17773, "âĻ¥ï¸ı": 17774, "globally": 17775, "cheaper": 17776, "uf": 17777, "mam": 17778, "nico": 17779, "iku": 17780, "thou": 17781, "nist": 17782, "dami": 17783, "thala": 17784, "rhodes": 17785, "sale": 17786, "bowls": 17787, "âĪ": 17788, "lasvegas": 17789, "sanctions": 17790, "admire": 17791, "matched": 17792, "unable": 17793, "traveler": 17794, "eleven": 17795, "strawberries": 17796, "âĢĶâĢĶâĢĶâĢĶ": 17797, "studio": 17798, "jacques": 17799, "ims": 17800, "valued": 17801, "sno": 17802, "cheesecake": 17803, "nxt": 17804, "eos": 17805, "sx": 17806, "fx": 17807, "tonic": 17808, "hatch": 17809, "chicks": 17810, "grads": 17811, "handic": 17812, "rory": 17813, "asp": 17814, "ripped": 17815, "dentist": 17816, "nen": 17817, "lufc": 17818, "âľĬ": 17819, "dige": 17820, "hopkins": 17821, "sherman": 17822, "fda": 17823, "forall": 17824, "ashley": 17825, "strand": 17826, "hy": 17827, "liquor": 17828, "buffet": 17829, "essence": 17830, "pharma": 17831, "suriya": 17832, "ðŁĴĻðŁĴĻ": 17833, "festivals": 17834, "zan": 17835, "refresh": 17836, "purple": 17837, "uniforms": 17838, "kenneth": 17839, "=)": 17840, "asan": 17841, "helsin": 17842, "transformers": 17843, "kali": 17844, "personalized": 17845, "chalk": 17846, "bobby": 17847, "âĮ": 17848, "themes": 17849, "departure": 17850, "print": 17851, "illustrations": 17852, "quiet": 17853, "agrees": 17854, "griff": 17855, "س": 17856, "miti": 17857, "together": 17858, "convenience": 17859, "abar": 17860, "carlo": 17861, "turtles": 17862, 
"infosec": 17863, "somewhat": 17864, "arlington": 17865, "scholarships": 17866, "emirates": 17867, "mums": 17868, "stella": 17869, "autonom": 17870, "feather": 17871, "gore": 17872, "nominees": 17873, "fragrance": 17874, "ÑĤ": 17875, "wong": 17876, "theastern": 17877, "gre": 17878, "zilla": 17879, "isi": 17880, "bumper": 17881, "goo": 17882, "dozens": 17883, "abduc": 17884, "âļªï¸ı": 17885, "oils": 17886, "donors": 17887, "silicon": 17888, "ipod": 17889, "fortnite": 17890, "ðŁĴ¨": 17891, "toro": 17892, "sparkling": 17893, "consciousness": 17894, "pala": 17895, "num": 17896, "mounted": 17897, "ffins": 17898, "thieves": 17899, "teammate": 17900, "prab": 17901, "omer": 17902, "tapes": 17903, "bod": 17904, "mitsu": 17905, "stew": 17906, "ere": 17907, "pbs": 17908, "tusc": 17909, "lowe": 17910, "rade": 17911, "parliamentary": 17912, "hm": 17913, "edgar": 17914, "ðŁijĩðŁijĩ": 17915, "toa": 17916, "agh": 17917, "honi": 17918, "slate": 17919, "geek": 17920, "apt": 17921, "hardt": 17922, "tap": 17923, "horizon": 17924, "growth": 17925, "makeover": 17926, "hil": 17927, "paperback": 17928, "idan": 17929, "rehabil": 17930, "giu": 17931, "possibilities": 17932, "lettu": 17933, "franco": 17934, "boss": 17935, "acher": 17936, "doesnt": 17937, "moe": 17938, "taker": 17939, "hussain": 17940, "mlk": 17941, "dil": 17942, "thia": 17943, "hama": 17944, "realised": 17945, "ravens": 17946, "curriculum": 17947, "mith": 17948, "knight": 17949, "tedx": 17950, "rv": 17951, "isaiah": 17952, "cumbria": 17953, "birthdays": 17954, "fing": 17955, "prez": 17956, "mubarak": 17957, "exquisite": 17958, "clearance": 17959, "yen": 17960, "pari": 17961, "evo": 17962, "ú": 17963, "modified": 17964, "applying": 17965, "implement": 17966, "discovering": 17967, "chapman": 17968, "indiegame": 17969, "disk": 17970, "crowdfunding": 17971, "machin": 17972, "livel": 17973, "styled": 17974, "âĿĮ": 17975, "making": 17976, "rehearsals": 17977, "nutriti": 17978, "subscription": 17979, "andro": 17980, "creators": 17981, "carries": 17982, "kylie": 17983, "camden": 17984, "apprentice": 17985, "taxpay": 17986, "cca": 17987, "tuesdaythoughts": 17988, "pissed": 17989, "erman": 17990, "detec": 17991, "freedom": 17992, "meri": 17993, "..!": 17994, "psalm": 17995, "sunlight": 17996, "perspec": 17997, "beings": 17998, "bookstore": 17999, "rockstar": 18000, "functions": 18001, "pence": 18002, "faves": 18003, "zn": 18004, "obamacare": 18005, "spill": 18006, "coventry": 18007, "pigeon": 18008, "pivo": 18009, "bait": 18010, "kolkata": 18011, "aval": 18012, "donor": 18013, "wah": 18014, "privileg": 18015, "traditions": 18016, "rajasthan": 18017, "teness": 18018, "portuguese": 18019, "ynes": 18020, "tackles": 18021, "defic": 18022, "torn": 18023, "polling": 18024, "thorne": 18025, "ina": 18026, "benedict": 18027, "barry": 18028, "calories": 18029, "verdict": 18030, "savethe": 18031, "norton": 18032, "office": 18033, "mainstream": 18034, "improves": 18035, "fron": 18036, "responding": 18037, "realtor": 18038, "scottish": 18039, "declar": 18040, "rl": 18041, "shiv": 18042, "supplier": 18043, "resting": 18044, "sweets": 18045, "qui": 18046, ".âĢ¦": 18047, "whitney": 18048, "startup": 18049, "thankyou": 18050, "teacher": 18051, "halls": 18052, "have": 18053, "handmade": 18054, "proving": 18055, "quartet": 18056, "rochester": 18057, "lian": 18058, "virtual": 18059, "mendes": 18060, "oficial": 18061, "midlands": 18062, "xbox": 18063, "measuring": 18064, "ovo": 18065, "accommodation": 18066, "brides": 18067, "collegiate": 18068, "intellectual": 18069, "incar": 
18070, "niag": 18071, "ðŁį·": 18072, "sfw": 18073, "cocoa": 18074, "coats": 18075, "civilians": 18076, "presidency": 18077, "matrix": 18078, "sweetheart": 18079, "triathlon": 18080, "wagner": 18081, "radic": 18082, "planner": 18083, "theo": 18084, "execution": 18085, "kum": 18086, "thewalkingdead": 18087, "scar": 18088, "rotation": 18089, "blogging": 18090, "bomb": 18091, "reson": 18092, "bbles": 18093, "stare": 18094, "assisted": 18095, "edo": 18096, "branded": 18097, "warnings": 18098, "thorpe": 18099, "acknowle": 18100, "satisfied": 18101, "shores": 18102, "rid": 18103, "dora": 18104, "physically": 18105, "bigh": 18106, "approves": 18107, "hah": 18108, "rical": 18109, "versatile": 18110, "pretend": 18111, "lum": 18112, "abhi": 18113, "yee": 18114, "spit": 18115, "ãĢĮ": 18116, "djs": 18117, "ashtra": 18118, "jt": 18119, "venues": 18120, "grammys": 18121, "cyclo": 18122, "tracker": 18123, "overwatch": 18124, "replica": 18125, "elyn": 18126, "nrl": 18127, "lindsey": 18128, "homo": 18129, "balloons": 18130, "kitchen": 18131, "sis": 18132, "amos": 18133, "endeav": 18134, "ðŁĴ»": 18135, "arec": 18136, "thug": 18137, "hooked": 18138, "hrc": 18139, "newyork": 18140, "burgh": 18141, "americas": 18142, "patricia": 18143, "ugu": 18144, "apathy": 18145, "hast": 18146, "psychi": 18147, "cork": 18148, "petrol": 18149, "ðŁİ¬": 18150, "aku": 18151, "popping": 18152, "psychological": 18153, "aux": 18154, "gma": 18155, "cadillac": 18156, "waste": 18157, "authent": 18158, "bristol": 18159, "name": 18160, "queer": 18161, "tober": 18162, "jerry": 18163, "comin": 18164, "chant": 18165, "privileged": 18166, "opar": 18167, "loser": 18168, "text": 18169, "marker": 18170, "stries": 18171, "equally": 18172, "aki": 18173, "christmas": 18174, "gareth": 18175, "blew": 18176, "emma": 18177, "imagin": 18178, "seals": 18179, "cheat": 18180, "conditioning": 18181, "jana": 18182, "rens": 18183, "daries": 18184, "oasis": 18185, "discounts": 18186, "council": 18187, "ika": 18188, "shirley": 18189, "voucher": 18190, "alps": 18191, "wx": 18192, "qr": 18193, "drift": 18194, "attempting": 18195, "utc": 18196, "ت": 18197, "gonzalez": 18198, "mf": 18199, "joker": 18200, "parallel": 18201, "pare": 18202, "aspects": 18203, "procedu": 18204, "np": 18205, "ama": 18206, "raleigh": 18207, "brighten": 18208, "guire": 18209, "radiation": 18210, "crescent": 18211, "hob": 18212, "ille": 18213, "strand": 18214, "vore": 18215, "nard": 18216, "chest": 18217, "diwali": 18218, "avatar": 18219, "alder": 18220, "dling": 18221, "pathetic": 18222, "ðŁĴĺ": 18223, "spirit": 18224, "jorge": 18225, "filmmaking": 18226, "ðŁĻıðŁĻı": 18227, "challenger": 18228, "bj": 18229, "downtown": 18230, "html": 18231, "adequ": 18232, "twisted": 18233, "inely": 18234, "('": 18235, "wraps": 18236, "operational": 18237, "yne": 18238, "nus": 18239, "magnet": 18240, "marketplace": 18241, "healthier": 18242, "snapshot": 18243, "damon": 18244, "interven": 18245, "federer": 18246, "owls": 18247, "biscuits": 18248, "jp": 18249, "rodeo": 18250, "blueberry": 18251, "lection": 18252, "frontier": 18253, "summers": 18254, "reyes": 18255, "pedestrian": 18256, "gol": 18257, "caffe": 18258, "refurbi": 18259, "boulder": 18260, "meghan": 18261, "specialty": 18262, "lass": 18263, "ei": 18264, "suspects": 18265, "approx": 18266, "rrr": 18267, "rath": 18268, "stim": 18269, "crushed": 18270, "hed": 18271, "whun": 18272, "loaf": 18273, "crore": 18274, "rivera": 18275, "genetics": 18276, "sock": 18277, "wasted": 18278, "nypd": 18279, "answering": 18280, "dove": 18281, "bella": 18282, 
"olin": 18283, "dun": 18284, "fiji": 18285, "pretty": 18286, "sparkle": 18287, "yun": 18288, "jd": 18289, "europa": 18290, "lifts": 18291, "amber": 18292, "mur": 18293, "tek": 18294, "boyd": 18295, "royalty": 18296, "indo": 18297, "rib": 18298, "gotham": 18299, "tiest": 18300, "installing": 18301, "kemp": 18302, "thephoto": 18303, "cosmic": 18304, ")))": 18305, "wholesale": 18306, "loyment": 18307, "easy": 18308, "suing": 18309, "settled": 18310, "afp": 18311, "prover": 18312, "supportive": 18313, "rees": 18314, "neath": 18315, "deliber": 18316, "cé": 18317, "welcome": 18318, "picoftheday": 18319, "newborn": 18320, "patty": 18321, "suns": 18322, "siest": 18323, "flint": 18324, "differently": 18325, "spoilers": 18326, "trooper": 18327, "gins": 18328, "cory": 18329, "lookout": 18330, "equipped": 18331, "tape": 18332, "toby": 18333, "researcher": 18334, "ush": 18335, "keyes": 18336, "alma": 18337, "induction": 18338, "kw": 18339, "khar": 18340, "slick": 18341, "bride": 18342, "eur": 18343, "craving": 18344, "bookings": 18345, "ches": 18346, "trunk": 18347, "vernon": 18348, "spher": 18349, "crystals": 18350, "relatively": 18351, "pompe": 18352, "unions": 18353, "valley": 18354, "para": 18355, "want": 18356, "okc": 18357, "deaf": 18358, "sergio": 18359, "lennon": 18360, "shay": 18361, "cra": 18362, "vat": 18363, "hee": 18364, "twe": 18365, "liquid": 18366, "poly": 18367, "ðŁİģ": 18368, "bent": 18369, "bearing": 18370, "motorsport": 18371, "barbe": 18372, "testi": 18373, "hani": 18374, "financing": 18375, "astronaut": 18376, "watercolour": 18377, "rish": 18378, "comiccon": 18379, "gart": 18380, "wrong": 18381, "bern": 18382, "itan": 18383, "stepped": 18384, "filters": 18385, "clow": 18386, "mex": 18387, "demons": 18388, "allo": 18389, "expanded": 18390, "command": 18391, "eters": 18392, "goats": 18393, "siri": 18394, "yr": 18395, "pottery": 18396, "marion": 18397, "ile": 18398, "elan": 18399, "santo": 18400, "persona": 18401, "duke": 18402, "homeless": 18403, "lighted": 18404, "wheeler": 18405, "changer": 18406, "cabbage": 18407, "surreal": 18408, "hamburg": 18409, "smashed": 18410, "stran": 18411, "knot": 18412, "iart": 18413, "obi": 18414, "bedro": 18415, "dial": 18416, "thick": 18417, "bingo": 18418, "fus": 18419, "vacuum": 18420, "conve": 18421, "ative": 18422, "accuracy": 18423, "account": 18424, "refer": 18425, "riz": 18426, "spiderman": 18427, "bana": 18428, "rite": 18429, "ub": 18430, "abs": 18431, "medical": 18432, "link": 18433, "siem": 18434, ">>>>": 18435, "betra": 18436, "glowing": 18437, "reactions": 18438, "puppet": 18439, "spaghetti": 18440, "angs": 18441, "remedi": 18442, "prayfor": 18443, "royce": 18444, "charlotte": 18445, "£ï¸ı": 18446, "ghet": 18447, "affecting": 18448, "rode": 18449, "socialist": 18450, "moses": 18451, "azi": 18452, "oit": 18453, "reporters": 18454, "cdt": 18455, "aping": 18456, "snat": 18457, "minimal": 18458, "waist": 18459, "siege": 18460, ">>>>": 18461, "rig": 18462, "schmidt": 18463, "hare": 18464, "eca": 18465, "thorn": 18466, "hemp": 18467, "esthe": 18468, "clyde": 18469, "tha": 18470, "donut": 18471, "mohamed": 18472, "lingerie": 18473, "legg": 18474, "carpenter": 18475, "performers": 18476, "dea": 18477, "imagined": 18478, "curse": 18479, "lash": 18480, "ctr": 18481, "agua": 18482, "roar": 18483, "gri": 18484, "role": 18485, "jfk": 18486, "resurrec": 18487, "roosevelt": 18488, "marilyn": 18489, "smalle": 18490, "willis": 18491, "waited": 18492, "charities": 18493, "theres": 18494, "lik": 18495, "original": 18496, "cari": 18497, "cough": 18498, 
"cruci": 18499, "lagun": 18500, "contrast": 18501, "kou": 18502, "armour": 18503, "removing": 18504, "tent": 18505, "mazda": 18506, "brighter": 18507, "thief": 18508, "corner": 18509, "tequila": 18510, "buzzing": 18511, "albi": 18512, "pam": 18513, "azure": 18514, "discoun": 18515, "pixelart": 18516, "possibility": 18517, "hamont": 18518, "trades": 18519, "buda": 18520, "hive": 18521, "versy": 18522, "finch": 18523, "transpa": 18524, "emi": 18525, "terrifying": 18526, "inqui": 18527, "gba": 18528, "substitu": 18529, "collecti": 18530, "placing": 18531, "cindy": 18532, "kann": 18533, "patho": 18534, "diamond": 18535, "mourinho": 18536, "guinea": 18537, "anthropo": 18538, "airs": 18539, "pumps": 18540, "ìļ": 18541, "paso": 18542, "curling": 18543, "anita": 18544, "residency": 18545, "newh": 18546, "joon": 18547, "cigarette": 18548, "queue": 18549, "extrac": 18550, "games": 18551, "splen": 18552, "express": 18553, "publicly": 18554, "bonnie": 18555, "tribune": 18556, "baek": 18557, "reasonable": 18558, "cor": 18559, "timothy": 18560, "sheeran": 18561, "ı": 18562, "fdn": 18563, "sutton": 18564, "concentration": 18565, "caravan": 18566, "xavier": 18567, "alger": 18568, "cylin": 18569, "frederick": 18570, "nerve": 18571, "peak": 18572, "lettuce": 18573, "jail": 18574, "pregame": 18575, "kavan": 18576, "upgraded": 18577, "ecology": 18578, "squadron": 18579, "grapes": 18580, "goog": 18581, "pastry": 18582, "ðŁĹ£": 18583, "ãĥ¼ãĥ": 18584, "milano": 18585, "awaz": 18586, "presenter": 18587, "ðŁĮ¿": 18588, "herd": 18589, "kings": 18590, "template": 18591, "flour": 18592, "hv": 18593, "kley": 18594, "iya": 18595, "spec": 18596, "ater": 18597, "frankfurt": 18598, "coch": 18599, "texting": 18600, "deli": 18601, "communist": 18602, "regiment": 18603, "eleanor": 18604, "anticipated": 18605, "ðŁijĮðŁı»": 18606, "thephotohour": 18607, "rano": 18608, "surviving": 18609, "simulation": 18610, "dawson": 18611, "arin": 18612, "aqua": 18613, "mor": 18614, "âĢ¦.": 18615, "cino": 18616, "iraqi": 18617, "shaz": 18618, "dundee": 18619, "wes": 18620, "drau": 18621, "hannah": 18622, "snews": 18623, "occupation": 18624, "steen": 18625, "xm": 18626, "angles": 18627, "settings": 18628, "guru": 18629, "knox": 18630, "orca": 18631, "shaping": 18632, "went": 18633, "drilling": 18634, "zzie": 18635, "bri": 18636, "kissing": 18637, "find": 18638, "maine": 18639, "âŃIJï¸ıâŃIJï¸ı": 18640, "ðŁĮį": 18641, "larry": 18642, "busted": 18643, "tavern": 18644, "actively": 18645, "-\"": 18646, "replacing": 18647, "nod": 18648, "unlock": 18649, ".\"": 18650, "âŀ¤": 18651, "affiliate": 18652, "tow": 18653, "ln": 18654, "happynewyear": 18655, "dif": 18656, "jm": 18657, "greenwich": 18658, "controversy": 18659, "dawg": 18660, "condol": 18661, "savannah": 18662, "compensation": 18663, "touchdown": 18664, "teo": 18665, "ambitious": 18666, "embroi": 18667, "convicted": 18668, "iartg": 18669, "barack": 18670, "trance": 18671, "testimony": 18672, "audition": 18673, "thumb": 18674, "myths": 18675, "bex": 18676, "quez": 18677, "orchid": 18678, "deny": 18679, "entitled": 18680, "hood": 18681, "grant": 18682, "inbox": 18683, "bluejays": 18684, "rilla": 18685, "smallest": 18686, "burden": 18687, "infamous": 18688, "divided": 18689, "boundaries": 18690, "tter": 18691, "elt": 18692, "wyoming": 18693, "beverage": 18694, "mesm": 18695, "onews": 18696, "buddhist": 18697, "yana": 18698, "assad": 18699, "isms": 18700, "barrett": 18701, "predicted": 18702, "backto": 18703, "twit": 18704, "ethere": 18705, "captains": 18706, "escaped": 18707, "ayo": 18708, 
"lamborgh": 18709, "gardner": 18710, "laps": 18711, "kal": 18712, "advertisement": 18713, "insects": 18714, "napo": 18715, "amen": 18716, "acy": 18717, "rand": 18718, "gk": 18719, "teh": 18720, "kathle": 18721, "tridge": 18722, "pancake": 18723, "atro": 18724, "pyramid": 18725, "bula": 18726, "paralym": 18727, "gauge": 18728, "encies": 18729, "tomy": 18730, "biscuit": 18731, "butcher": 18732, "qualifier": 18733, "county": 18734, "kei": 18735, "pools": 18736, "darker": 18737, "shoulders": 18738, "ðŁĩºðŁĩ¸ðŁĩºðŁĩ¸": 18739, "spre": 18740, "(\"": 18741, "writers": 18742, "gm": 18743, "ðŁİĵ": 18744, "knit": 18745, "huff": 18746, "mtb": 18747, "phillies": 18748, "ost": 18749, "denis": 18750, "gart": 18751, "licensed": 18752, "interface": 18753, "excel": 18754, "dwell": 18755, "fromthe": 18756, "cofficial": 18757, "azzi": 18758, "appearing": 18759, "forest": 18760, "nana": 18761, "keith": 18762, "manufacturers": 18763, "beckham": 18764, ")?": 18765, "ese": 18766, "colony": 18767, "delicate": 18768, "utter": 18769, "mcin": 18770, "transplant": 18771, "preferred": 18772, "pard": 18773, "arie": 18774, "hub": 18775, "pods": 18776, "perspectives": 18777, "pict": 18778, "delu": 18779, "apper": 18780, "bethan": 18781, "pmo": 18782, "criminals": 18783, "feminism": 18784, "shack": 18785, "circumstances": 18786, "fellas": 18787, "protesting": 18788, "wax": 18789, "suggested": 18790, "tator": 18791, "drew": 18792, "omni": 18793, "fake": 18794, "kathy": 18795, "reb": 18796, "deline": 18797, "berni": 18798, "misty": 18799, "ðŁij©": 18800, "erable": 18801, "breakthrough": 18802, "menswear": 18803, "millennials": 18804, "chanyeol": 18805, "laz": 18806, "insert": 18807, "replies": 18808, "phrase": 18809, "nx": 18810, "iheartawards": 18811, "audrey": 18812, "granite": 18813, "racec": 18814, "orie": 18815, "terra": 18816, "innovations": 18817, "brittany": 18818, "ateral": 18819, "pear": 18820, "biological": 18821, "shments": 18822, "institution": 18823, "msn": 18824, "frequency": 18825, "dman": 18826, "neglec": 18827, "tf": 18828, "stefan": 18829, "foxnews": 18830, "typo": 18831, "comms": 18832, "sequence": 18833, "carmen": 18834, "whites": 18835, "economist": 18836, "exeter": 18837, "seum": 18838, "resorts": 18839, "casually": 18840, "bunde": 18841, "divide": 18842, "ع": 18843, "gag": 18844, "creed": 18845, "retire": 18846, "caucus": 18847, "rapids": 18848, "wrestlemania": 18849, "tulsa": 18850, "sunderland": 18851, "fundament": 18852, "odi": 18853, "yamaha": 18854, "vary": 18855, "intrigu": 18856, "else": 18857, "beacon": 18858, "angie": 18859, "traded": 18860, "transm": 18861, "gents": 18862, "knitting": 18863, "galac": 18864, "ðĿĹ": 18865, "uto": 18866, "seaside": 18867, "holt": 18868, "rers": 18869, "fargo": 18870, "trainers": 18871, "monsoon": 18872, "bale": 18873, "sought": 18874, "maddie": 18875, "hw": 18876, "coli": 18877, "fran": 18878, "favs": 18879, "ðŁĴĶ": 18880, "intent": 18881, "rally": 18882, "sbs": 18883, "lemonade": 18884, "barackobama": 18885, "bread": 18886, "sticky": 18887, "explosive": 18888, "chelten": 18889, "tj": 18890, "assoc": 18891, "ramen": 18892, "homies": 18893, "vlog": 18894, "mister": 18895, "lord": 18896, "âĢįâĻĢï¸ı": 18897, "alyssa": 18898, "sketchbook": 18899, "rumble": 18900, "catch": 18901, "migrant": 18902, "discipline": 18903, "unlikely": 18904, "chronicles": 18905, "flora": 18906, "slams": 18907, "amid": 18908, "sboro": 18909, "coop": 18910, "jumps": 18911, "tranqu": 18912, "melis": 18913, "sofia": 18914, "enri": 18915, "gabe": 18916, "syri": 18917, "nicolas": 18918, 
"chai": 18919, "wv": 18920, "becky": 18921, "footy": 18922, "tao": 18923, "suppose": 18924, "ðŁĺįðŁĺįðŁĺįðŁĺį": 18925, "plush": 18926, "rish": 18927, "ðŁ¤ĵ": 18928, "kha": 18929, "saturdays": 18930, "accent": 18931, "hec": 18932, "limit": 18933, "carlton": 18934, "wired": 18935, "taylorswift": 18936, "ðŁĺij": 18937, "sql": 18938, "harro": 18939, "recipients": 18940, "gat": 18941, "gop": 18942, "thof": 18943, "amazed": 18944, "ghan": 18945, "ðŁıĨðŁıĨ": 18946, "porto": 18947, "clare": 18948, "distant": 18949, "nac": 18950, "ohio": 18951, "ðŁĻıðŁı¼": 18952, "mtn": 18953, "antibio": 18954, "dinosa": 18955, "mesa": 18956, "partial": 18957, "bv": 18958, "learnt": 18959, "lovato": 18960, "question": 18961, "extract": 18962, "gossip": 18963, "gibb": 18964, "niagara": 18965, "ðŁij¨": 18966, "displayed": 18967, "sooner": 18968, "stevie": 18969, "nuggets": 18970, "mln": 18971, "brom": 18972, "turb": 18973, "giveaways": 18974, "stupi": 18975, "blink": 18976, "cili": 18977, "convenient": 18978, "moh": 18979, "vive": 18980, "fric": 18981, "cause": 18982, "chamber": 18983, "cules": 18984, "nearest": 18985, "isse": 18986, "smallbiz": 18987, "tj": 18988, "canadians": 18989, "smarter": 18990, "brasil": 18991, "rare": 18992, "quette": 18993, "wha": 18994, "candle": 18995, "atomic": 18996, "ðŁijįðŁijį": 18997, "warrior": 18998, "relaxed": 18999, "strips": 19000, "neur": 19001, "kka": 19002, "rfc": 19003, "jensen": 19004, "recovering": 19005, "responses": 19006, "salam": 19007, "orthodox": 19008, "active": 19009, "ellers": 19010, "nit": 19011, "âŃIJ": 19012, "metropolitan": 19013, "centuries": 19014, "vida": 19015, "grading": 19016, "transparent": 19017, "simple": 19018, "dots": 19019, "superintendent": 19020, "elevator": 19021, "automated": 19022, "redskins": 19023, "imam": 19024, "summertime": 19025, "jonathan": 19026, "gearing": 19027, "michelle": 19028, "conflic": 19029, "mice": 19030, "tote": 19031, "publish": 19032, "pax": 19033, ")-": 19034, "nailed": 19035, "á´": 19036, "telescope": 19037, "serbia": 19038, "bab": 19039, "apeu": 19040, "stically": 19041, "senti": 19042, "rats": 19043, "isolated": 19044, "group": 19045, "hatred": 19046, "paranormal": 19047, "stanley": 19048, "alion": 19049, "safety": 19050, "ls": 19051, "र": 19052, "nexus": 19053, "alexandra": 19054, "masks": 19055, "++": 19056, "tron": 19057, "auk": 19058, "brotherhood": 19059, "browse": 19060, "mixes": 19061, "simone": 19062, "musk": 19063, "approve": 19064, "lola": 19065, "exp": 19066, "perth": 19067, "futuri": 19068, "unseen": 19069, "dm": 19070, "chelse": 19071, "scouting": 19072, "owe": 19073, "portsmouth": 19074, "kram": 19075, "mize": 19076, "dispen": 19077, "sup": 19078, "dlc": 19079, "advert": 19080, "teresa": 19081, "isle": 19082, "cycle": 19083, "metall": 19084, "shields": 19085, "mariners": 19086, "raz": 19087, "ingen": 19088, "fund": 19089, "ango": 19090, "jones": 19091, "oka": 19092, "madden": 19093, "broccoli": 19094, "dominic": 19095, "situations": 19096, "mero": 19097, "cricke": 19098, "punishment": 19099, "db": 19100, "shaking": 19101, "ðŁĺļ": 19102, "mq": 19103, "arians": 19104, "leh": 19105, "claw": 19106, "weds": 19107, "dure": 19108, "niel": 19109, "jelly": 19110, "gourmet": 19111, "traders": 19112, "levi": 19113, "wages": 19114, "knees": 19115, "wise": 19116, "heavenly": 19117, "avid": 19118, "melody": 19119, "zack": 19120, "bananas": 19121, "apprentice": 19122, "prop": 19123, "funny": 19124, "ode": 19125, "respected": 19126, "megan": 19127, "fewer": 19128, "drafted": 19129, "medit": 19130, "grape": 19131, 
"usarmy": 19132, "crusad": 19133, "vocali": 19134, "preparations": 19135, "nonsense": 19136, "usage": 19137, "thr": 19138, "roth": 19139, "wizards": 19140, "inside": 19141, "promotions": 19142, "mona": 19143, "redsox": 19144, "sig": 19145, "elegance": 19146, "chia": 19147, "universal": 19148, "ãĢį": 19149, "raja": 19150, "unga": 19151, "pollin": 19152, "filipino": 19153, "aka": 19154, "tsun": 19155, "ikon": 19156, "biking": 19157, "decorations": 19158, "zac": 19159, "cadets": 19160, "humour": 19161, "agm": 19162, "reppin": 19163, "vaccin": 19164, "elove": 19165, "uw": 19166, "diabe": 19167, "gallagher": 19168, "azer": 19169, "dol": 19170, "awhile": 19171, "prominent": 19172, "welsh": 19173, "tann": 19174, "')": 19175, "bien": 19176, "wag": 19177, "inal": 19178, "cwc": 19179, "wicket": 19180, "urst": 19181, "qanon": 19182, "xe": 19183, "outdoor": 19184, "dunn": 19185, "starr": 19186, "cology": 19187, "ricky": 19188, "uefa": 19189, "rebounds": 19190, "smusic": 19191, "infant": 19192, "ðŁĻĭ": 19193, "sop": 19194, "umber": 19195, "handing": 19196, "begin": 19197, "sorting": 19198, "hash": 19199, "spati": 19200, "rek": 19201, "budapest": 19202, "blackhawks": 19203, "delete": 19204, "rom": 19205, "candid": 19206, "authori": 19207, "debris": 19208, "specul": 19209, "intersection": 19210, "marriott": 19211, "imran": 19212, "ðŁĺģðŁĺģ": 19213, "cruises": 19214, "ramsey": 19215, "rafael": 19216, "awareness": 19217, "vascular": 19218, "beyoncé": 19219, "rug": 19220, "ðŁĺĮ": 19221, "festiv": 19222, "aram": 19223, "sable": 19224, "basil": 19225, "pill": 19226, "flooring": 19227, "unbeaten": 19228, "implications": 19229, "uf": 19230, "wound": 19231, "forge": 19232, "pointing": 19233, "pots": 19234, "popularity": 19235, "ðŁijıðŁı»": 19236, "manipul": 19237, "slots": 19238, "debates": 19239, "absence": 19240, "vermont": 19241, "neverforget": 19242, "wrist": 19243, "gloria": 19244, "rence": 19245, "husk": 19246, "melting": 19247, "ðŁİŁ": 19248, "braces": 19249, "timely": 19250, "transforming": 19251, "amps": 19252, "mak": 19253, "poe": 19254, "ahan": 19255, "generally": 19256, "ndp": 19257, "aleppo": 19258, "unicef": 19259, "profs": 19260, "nord": 19261, "mask": 19262, "jacksonville": 19263, "vv": 19264, "shells": 19265, "blooming": 19266, "operators": 19267, "charcoal": 19268, "neville": 19269, "magi": 19270, "chip": 19271, "sama": 19272, "iran": 19273, "reforms": 19274, "accumul": 19275, "rue": 19276, "æľ": 19277, "websites": 19278, "gaon": 19279, "devastating": 19280, "stos": 19281, "glacier": 19282, "rapp": 19283, "chipotle": 19284, "pra": 19285, "orous": 19286, "romney": 19287, "season": 19288, "decorative": 19289, "cisco": 19290, "ditch": 19291, "complain": 19292, "llo": 19293, "assume": 19294, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 19295, "nels": 19296, "centric": 19297, "ftw": 19298, "carrots": 19299, "tata": 19300, "canter": 19301, "perience": 19302, "liers": 19303, "demos": 19304, "blunt": 19305, "operate": 19306, "reservations": 19307, "leah": 19308, "substance": 19309, "dison": 19310, "ante": 19311, "election": 19312, "vue": 19313, "square": 19314, "nonprofit": 19315, "caa": 19316, "fsu": 19317, "yam": 19318, "ãĤ¤": 19319, "vladi": 19320, "completes": 19321, "mari": 19322, "phillip": 19323, "neill": 19324, "eras": 19325, "kait": 19326, "mendo": 19327, "maharashtra": 19328, "gp": 19329, "dane": 19330, "providence": 19331, "therapeu": 19332, "juvenile": 19333, "memo": 19334, "incorpor": 19335, "aaaa": 19336, "seventeen": 19337, "teenager": 19338, "ã": 19339, "orns": 19340, "wide": 19341, "cuteness": 19342, 
"twd": 19343, "ffles": 19344, "bara": 19345, "comedy": 19346, "overtime": 19347, "yaz": 19348, "baron": 19349, "unemployment": 19350, "ðŁijĭ": 19351, "exterior": 19352, "dense": 19353, "centres": 19354, "matchup": 19355, "historymonth": 19356, "artificial": 19357, "quit": 19358, "esk": 19359, "warn": 19360, "critic": 19361, "jaf": 19362, "ðŁĵ²": 19363, "informative": 19364, "fuels": 19365, "recycle": 19366, "naming": 19367, "stripe": 19368, "solic": 19369, "molecular": 19370, "deepi": 19371, "convo": 19372, "ssel": 19373, "nae": 19374, "descent": 19375, "tiz": 19376, "accountability": 19377, "terry": 19378, "rito": 19379, "slay": 19380, "emo": 19381, "demol": 19382, "sensation": 19383, "cov": 19384, "tore": 19385, "roundtable": 19386, "yol": 19387, "excuses": 19388, "à¥į": 19389, "turquo": 19390, "hhhh": 19391, "podcasts": 19392, "celeb": 19393, "messi": 19394, "lio": 19395, "mann": 19396, "contributed": 19397, "uz": 19398, "generator": 19399, "elets": 19400, "veggie": 19401, "indul": 19402, "ensuring": 19403, "detroit": 19404, "punjab": 19405, "transpor": 19406, "instruction": 19407, "add": 19408, "porcel": 19409, "paneli": 19410, "circles": 19411, "persist": 19412, "clayton": 19413, "spn": 19414, "dogsoftwitter": 19415, "isnt": 19416, "spr": 19417, "retailers": 19418, "pw": 19419, "hungar": 19420, "elena": 19421, "monaster": 19422, "guatem": 19423, "jessie": 19424, "anz": 19425, "rashi": 19426, "flee": 19427, "carving": 19428, "faux": 19429, "lal": 19430, "henri": 19431, "djo": 19432, "dull": 19433, "sana": 19434, "lara": 19435, "globe": 19436, "crimson": 19437, "compass": 19438, "pause": 19439, "nab": 19440, "lionel": 19441, "baths": 19442, "ufo": 19443, "inventory": 19444, "singh": 19445, "satan": 19446, "ðŁĩ¸": 19447, "cements": 19448, "inform": 19449, "generated": 19450, "biden": 19451, "avg": 19452, "tasks": 19453, "deer": 19454, "sau": 19455, "jailed": 19456, "pastel": 19457, "scc": 19458, "nail": 19459, "steele": 19460, "peris": 19461, "lamborghini": 19462, "pursue": 19463, "margin": 19464, "uch": 19465, "bosch": 19466, "drain": 19467, "clara": 19468, "bom": 19469, "latino": 19470, "webster": 19471, "rosemary": 19472, "rha": 19473, "soun": 19474, "billionaire": 19475, "notch": 19476, "percentage": 19477, "conor": 19478, "'\"": 19479, "homes": 19480, "earthday": 19481, "hort": 19482, "biggest": 19483, "disin": 19484, "walton": 19485, "editors": 19486, "imma": 19487, "omar": 19488, "equivalent": 19489, "pharmaceu": 19490, "ahmed": 19491, "cameo": 19492, "hanni": 19493, "underrated": 19494, "gement": 19495, "microbi": 19496, "voo": 19497, "honorable": 19498, "obesity": 19499, "âļ¡ï¸ı": 19500, "limerick": 19501, "involvement": 19502, "stagram": 19503, "boulevard": 19504, "burg": 19505, "blackandwhite": 19506, "liberation": 19507, "five": 19508, "interim": 19509, "smm": 19510, "rivalry": 19511, "capabilities": 19512, "statements": 19513, "thumb": 19514, "ved": 19515, "swans": 19516, "barber": 19517, "eque": 19518, "serena": 19519, "helm": 19520, "noodle": 19521, "sampling": 19522, "nawaz": 19523, "single": 19524, "thunderstorms": 19525, "shon": 19526, "inev": 19527, "ë¯": 19528, "topp": 19529, "orchard": 19530, "bian": 19531, "ðŁĺĶ": 19532, "doorstep": 19533, "salvation": 19534, "marketing": 19535, "rons": 19536, "clemson": 19537, "ravi": 19538, "intake": 19539, "standwith": 19540, "sina": 19541, "haiku": 19542, "pley": 19543, "electoral": 19544, "philly": 19545, "lays": 19546, "electric": 19547, "capturing": 19548, "upp": 19549, "ergy": 19550, "believing": 19551, "cultures": 19552, 
"esday": 19553, "invasive": 19554, "eded": 19555, "speech": 19556, "endur": 19557, "vietnam": 19558, "boycott": 19559, "pede": 19560, "deliver": 19561, "ðŁĴĸðŁĴĸ": 19562, "merchant": 19563, "stir": 19564, "denies": 19565, "pockets": 19566, "oti": 19567, "cuddle": 19568, "roland": 19569, "mmed": 19570, "dened": 19571, "learners": 19572, "hoop": 19573, "sourcing": 19574, "hacked": 19575, "dim": 19576, "environments": 19577, "benson": 19578, "judicial": 19579, "worcester": 19580, "pearls": 19581, "governments": 19582, "arrivals": 19583, "corners": 19584, "tuning": 19585, "labour": 19586, "ym": 19587, "ordering": 19588, "lewi": 19589, "ife": 19590, "hygiene": 19591, "thoughtful": 19592, "indonesian": 19593, "campaigning": 19594, "principle": 19595, "assaul": 19596, "rubb": 19597, "atv": 19598, "willy": 19599, "entre": 19600, "ili": 19601, "phon": 19602, "duties": 19603, "âĻ¥âĻ¥": 19604, "snakes": 19605, "loop": 19606, "amar": 19607, "convertible": 19608, "bonding": 19609, "mentoring": 19610, "maxwell": 19611, "ethereum": 19612, "destroying": 19613, "axis": 19614, "cairo": 19615, "finnish": 19616, "shock": 19617, "ðŁĺIJ": 19618, "caleb": 19619, "coma": 19620, "pedal": 19621, "core": 19622, "continent": 19623, "elson": 19624, "tempo": 19625, "helsinki": 19626, "acp": 19627, "tackling": 19628, "stated": 19629, "bla": 19630, "doub": 19631, "smashing": 19632, "aja": 19633, "cameron": 19634, "disruption": 19635, "warmth": 19636, "beingsalmankhan": 19637, "bulletin": 19638, "ode": 19639, "syracuse": 19640, "aran": 19641, "mcgregor": 19642, "bulk": 19643, "anton": 19644, "confirmation": 19645, "spine": 19646, "imran": 19647, "instruc": 19648, "jacks": 19649, "chio": 19650, "palm": 19651, "stre": 19652, "embarrassing": 19653, "unt": 19654, "eliminate": 19655, "toss": 19656, "cise": 19657, "aws": 19658, "onists": 19659, "shinee": 19660, "jos": 19661, "hose": 19662, "lively": 19663, "opponents": 19664, "movements": 19665, "recognizing": 19666, "sandwiches": 19667, "shakes": 19668, "exercises": 19669, "seat": 19670, "profession": 19671, "merrychristmas": 19672, "lugg": 19673, "adoptdont": 19674, "marvin": 19675, "byrne": 19676, "unle": 19677, "het": 19678, "kuwait": 19679, "rahman": 19680, "aspect": 19681, "humbled": 19682, "genes": 19683, "fand": 19684, "longtime": 19685, ");": 19686, "campu": 19687, "angus": 19688, "ðŁijįðŁı¼": 19689, "quran": 19690, "sleeves": 19691, "slic": 19692, "¸ë": 19693, "twelve": 19694, "youre": 19695, "ike": 19696, "gogh": 19697, "bst": 19698, "dictionary": 19699, "reflecting": 19700, "toon": 19701, "yarn": 19702, "embed": 19703, "ðŁı´": 19704, "reserves": 19705, "flooded": 19706, "veriz": 19707, "dusk": 19708, "establish": 19709, "proli": 19710, "aud": 19711, "ritual": 19712, "orbit": 19713, "declaration": 19714, "recordings": 19715, "camo": 19716, "cassette": 19717, "goodluck": 19718, "cutter": 19719, "bop": 19720, "bho": 19721, "cheating": 19722, "pacific": 19723, "mares": 19724, "timer": 19725, "colt": 19726, "trous": 19727, "tomorrow": 19728, "hansen": 19729, "cie": 19730, "wang": 19731, "bani": 19732, "circular": 19733, "acute": 19734, "farmer": 19735, "coys": 19736, "pse": 19737, "irving": 19738, "wj": 19739, "hawkins": 19740, "bison": 19741, "urday": 19742, "cruising": 19743, "ote": 19744, "kath": 19745, "whistle": 19746, "yourselves": 19747, "antis": 19748, "slash": 19749, "thoroughly": 19750, "kesh": 19751, "serie": 19752, "exem": 19753, "enig": 19754, "guild": 19755, "shred": 19756, "hogan": 19757, "apo": 19758, "ä¸": 19759, "puzz": 19760, "netball": 19761, "aussi": 
19762, "panorama": 19763, "wsj": 19764, "avis": 19765, "arming": 19766, "humph": 19767, "browser": 19768, "cries": 19769, "foggy": 19770, "matte": 19771, "ðŁĮ»": 19772, "iter": 19773, "tallest": 19774, "byron": 19775, "captiv": 19776, "jesu": 19777, "anyways": 19778, "flagship": 19779, "pton": 19780, "wey": 19781, "fayette": 19782, "financial": 19783, "foul": 19784, "solomon": 19785, "jennifer": 19786, "cucumber": 19787, "argue": 19788, "textile": 19789, "wrestler": 19790, "johnston": 19791, "pastor": 19792, "ðŁĺŃðŁĺŃðŁĺŃðŁĺŃ": 19793, "cactus": 19794, "edible": 19795, "reserved": 19796, "richie": 19797, "metres": 19798, "ingredient": 19799, "hella": 19800, "unto": 19801, "chol": 19802, "celebs": 19803, "poets": 19804, "graham": 19805, "hayden": 19806, "coincidence": 19807, "baw": 19808, "communicate": 19809, "fletcher": 19810, "/-": 19811, "toledo": 19812, "ecuador": 19813, "counsel": 19814, "slaughter": 19815, "linear": 19816, "atp": 19817, "osu": 19818, "joel": 19819, "eved": 19820, "conquer": 19821, "rustic": 19822, "plicity": 19823, "recognise": 19824, "roommate": 19825, "cracked": 19826, "jasper": 19827, "pher": 19828, "ðŁĮº": 19829, "woven": 19830, "moist": 19831, "ffc": 19832, "steering": 19833, "nish": 19834, "standings": 19835, "frequent": 19836, "ardi": 19837, "hazel": 19838, "asmsg": 19839, "baum": 19840, "dart": 19841, "sidd": 19842, "nath": 19843, "chero": 19844, "cardboard": 19845, "css": 19846, "nsfw": 19847, "pair": 19848, "ðŁĺįðŁĺĺ": 19849, "occurred": 19850, "homelessness": 19851, "malone": 19852, "phe": 19853, "xia": 19854, "paddy": 19855, "declare": 19856, "theatre": 19857, "bf": 19858, "persian": 19859, "tad": 19860, "axe": 19861, "suspicious": 19862, "lamb": 19863, "mucho": 19864, "senior": 19865, "stas": 19866, "kite": 19867, "sting": 19868, "grad": 19869, "kaf": 19870, "watering": 19871, "د": 19872, "spiral": 19873, "thms": 19874, "educator": 19875, "jerome": 19876, "ofc": 19877, "clock": 19878, "sul": 19879, "pemb": 19880, ".........": 19881, "parkway": 19882, "deaux": 19883, "restrictions": 19884, "mons": 19885, "needle": 19886, "ej": 19887, "leagues": 19888, "watermelon": 19889, "aman": 19890, "plenary": 19891, "maxim": 19892, "wab": 19893, "comingsoon": 19894, "bryce": 19895, "vigil": 19896, "supermarket": 19897, "fortunate": 19898, "turquoise": 19899, "president": 19900, "liv": 19901, "interns": 19902, "feelin": 19903, "fixtures": 19904, "stunt": 19905, "staged": 19906, "premieres": 19907, "lok": 19908, "practiti": 19909, "shortage": 19910, "logne": 19911, "vec": 19912, "concor": 19913, "rocke": 19914, "lig": 19915, "composed": 19916, "synthetic": 19917, "dip": 19918, "camila": 19919, "chis": 19920, "jou": 19921, "susan": 19922, "eyebrows": 19923, "supplement": 19924, "satisfaction": 19925, "mohammad": 19926, "tibet": 19927, "houseof": 19928, "pun": 19929, "assam": 19930, "shadowhun": 19931, "psyched": 19932, "seduc": 19933, "mandatory": 19934, "herbert": 19935, "scallo": 19936, "streamers": 19937, "protocol": 19938, "blockbuster": 19939, "produces": 19940, "schnei": 19941, "laurel": 19942, "tribe": 19943, "timehop": 19944, "pla": 19945, "modelling": 19946, "tvtime": 19947, "mtvstars": 19948, "widow": 19949, "metric": 19950, "cham": 19951, "condo": 19952, "flowering": 19953, "alec": 19954, "dms": 19955, "intensity": 19956, "¨": 19957, "mccartney": 19958, "islamabad": 19959, "kb": 19960, "ffi": 19961, "phal": 19962, "analog": 19963, "fond": 19964, "hacks": 19965, "positivity": 19966, "treaty": 19967, "submarine": 19968, "connect": 19969, "selen": 19970, 
"categories": 19971, "cub": 19972, "organize": 19973, "sik": 19974, "quoteoftheday": 19975, "reminding": 19976, "amor": 19977, "locking": 19978, "ðŁijıðŁı¼": 19979, "compound": 19980, "ette": 19981, "bout": 19982, "recur": 19983, "ference": 19984, "mizz": 19985, "trend": 19986, "hipster": 19987, "fortress": 19988, "forthcoming": 19989, "prelimin": 19990, "odyssey": 19991, "angp": 19992, "delici": 19993, "evenings": 19994, "ðŁĶ¹": 19995, "iq": 19996, "dw": 19997, "dair": 19998, "kathryn": 19999, "christianity": 20000, "moonlight": 20001, "hab": 20002, "whoo": 20003, "fbf": 20004, "seth": 20005, "genuinely": 20006, "pax": 20007, "charity": 20008, "deployed": 20009, "bnb": 20010, "bucs": 20011, "judg": 20012, "conge": 20013, "plantation": 20014, "impress": 20015, "cara": 20016, "sclub": 20017, "scopy": 20018, "landers": 20019, "complaints": 20020, "bama": 20021, "rebuild": 20022, "xy": 20023, "realism": 20024, "shour": 20025, "lein": 20026, "bracelets": 20027, "mera": 20028, "assassin": 20029, "anchor": 20030, "ðŁijĮðŁı¼": 20031, "linen": 20032, "confron": 20033, "chronicle": 20034, "comment": 20035, "catalog": 20036, "illes": 20037, "gorge": 20038, "metry": 20039, "jungkook": 20040, "lovemy": 20041, "sentin": 20042, "seem": 20043, "fitness": 20044, "allied": 20045, "tsman": 20046, "digitaltransformation": 20047, "pran": 20048, "loft": 20049, "minton": 20050, "aldenrichards": 20051, "envel": 20052, "cherish": 20053, "certainty": 20054, "zzz": 20055, "rhino": 20056, "perkins": 20057, "enrich": 20058, "capetown": 20059, "ometer": 20060, "sections": 20061, "skeleton": 20062, "defenders": 20063, "ðŁĺĿ": 20064, "penc": 20065, "brit": 20066, "jah": 20067, "capitalism": 20068, "ðŁ¥ĩ": 20069, "bazaar": 20070, "reme": 20071, "ext": 20072, "kkk": 20073, "convert": 20074, "stormy": 20075, "bye": 20076, "karan": 20077, "chrysler": 20078, "ados": 20079, "pressed": 20080, "sync": 20081, "ationday": 20082, "danger": 20083, "badges": 20084, "refuses": 20085, "empowering": 20086, "lym": 20087, "exports": 20088, "adoptdontshop": 20089, "ðŁĩ¯": 20090, "thc": 20091, "awaited": 20092, "focuses": 20093, "fined": 20094, "oat": 20095, "hahahah": 20096, "âģ©": 20097, "nfamily": 20098, "fiona": 20099, "luckily": 20100, "thrilling": 20101, "typing": 20102, "outbreak": 20103, "dies": 20104, "heu": 20105, "crawl": 20106, "nesses": 20107, "oath": 20108, "scripts": 20109, "geeks": 20110, "ðŁIJĿ": 20111, "pb": 20112, "mathematics": 20113, "alis": 20114, "________________": 20115, "gymnastics": 20116, "activism": 20117, "recommendation": 20118, "gren": 20119, "wain": 20120, "courty": 20121, "napol": 20122, "cauli": 20123, "hornets": 20124, "gals": 20125, "jockey": 20126, "dirty": 20127, "atar": 20128, "enormous": 20129, "pest": 20130, "gregation": 20131, "anos": 20132, "iiii": 20133, "defends": 20134, "blackhistorymonth": 20135, "atx": 20136, "mbc": 20137, "luggage": 20138, "witch": 20139, "cob": 20140, "lasts": 20141, "cum": 20142, "ggg": 20143, "bathing": 20144, "nar": 20145, "cebu": 20146, "ðŁįĥ": 20147, "navigation": 20148, "mine": 20149, "rejo": 20150, "ðŁİĢ": 20151, "giftide": 20152, "reta": 20153, "useless": 20154, "pull": 20155, "deficit": 20156, "allu": 20157, "atime": 20158, "itv": 20159, "trillion": 20160, "pue": 20161, "acies": 20162, "procedure": 20163, "lori": 20164, "jenny": 20165, "cad": 20166, "ulously": 20167, "drac": 20168, "promotes": 20169, "ingthe": 20170, "canu": 20171, "woohoo": 20172, "naomi": 20173, "zardari": 20174, "tsu": 20175, "beir": 20176, "sdg": 20177, "lever": 20178, "weber": 20179, "abud": 
20180, "lund": 20181, "crowded": 20182, "deployment": 20183, "terrain": 20184, "kenny": 20185, "hof": 20186, "witnessed": 20187, "loch": 20188, "jk": 20189, "bully": 20190, "wren": 20191, "poetry": 20192, "doff": 20193, "wwi": 20194, "mored": 20195, "dini": 20196, "culture": 20197, "prompt": 20198, "Â¥": 20199, "maurice": 20200, "topps": 20201, "rm": 20202, "correspon": 20203, "about": 20204, "jewels": 20205, "gibr": 20206, "eagle": 20207, "ðŁĺĺðŁĺĺðŁĺĺ": 20208, "lending": 20209, "souven": 20210, "çĶ": 20211, "contemporaryart": 20212, "establishment": 20213, "jong": 20214, "âĢ¦\"": 20215, "gator": 20216, "patriotic": 20217, "mccoy": 20218, "vape": 20219, "humane": 20220, "feliz": 20221, "coachella": 20222, "reposting": 20223, "steals": 20224, "fuller": 20225, "nering": 20226, "atra": 20227, "(-": 20228, "blake": 20229, "heather": 20230, "worms": 20231, "disciplinary": 20232, "redemption": 20233, "yard": 20234, "amin": 20235, "\"@_": 20236, "dnc": 20237, "tds": 20238, "kappa": 20239, "newark": 20240, "commits": 20241, "spears": 20242, "jams": 20243, "tand": 20244, "msnbc": 20245, "intermedi": 20246, "aimed": 20247, "atic": 20248, "teenth": 20249, "observation": 20250, "kashmir": 20251, "kavanaugh": 20252, "oul": 20253, "sanfrancisco": 20254, "reu": 20255, "belated": 20256, "chow": 20257, "password": 20258, "stills": 20259, "detained": 20260, "sari": 20261, "dayton": 20262, "darren": 20263, "italian": 20264, "arth": 20265, "amusic": 20266, "arbit": 20267, "wm": 20268, "vm": 20269, "hem": 20270, "doug": 20271, "myr": 20272, "asho": 20273, "prev": 20274, "vind": 20275, "brah": 20276, "stag": 20277, "ี": 20278, "previews": 20279, "guk": 20280, "containing": 20281, "leonardo": 20282, "saddle": 20283, "rushing": 20284, "stav": 20285, "longh": 20286, "gambling": 20287, "vegas": 20288, "reservation": 20289, "endale": 20290, "bala": 20291, "fla": 20292, "variant": 20293, "hedge": 20294, "bulgaria": 20295, "natali": 20296, "weaver": 20297, "solst": 20298, "encouraged": 20299, "apc": 20300, "asparag": 20301, "nest": 20302, "cyclists": 20303, "fel": 20304, "ìĬ¤": 20305, "overwhelming": 20306, "peyton": 20307, "jit": 20308, "apost": 20309, "mble": 20310, "bleeding": 20311, "neighbourhood": 20312, "avery": 20313, "expressions": 20314, "macdonald": 20315, "gigs": 20316, "monds": 20317, "illusion": 20318, "nct": 20319, "camero": 20320, "overhead": 20321, "myth": 20322, "oly": 20323, "vio": 20324, "etv": 20325, "laurie": 20326, "unveiling": 20327, "prior": 20328, "conn": 20329, "ironman": 20330, "diff": 20331, "dayin": 20332, "critici": 20333, "congo": 20334, "revision": 20335, "wale": 20336, "director": 20337, "pines": 20338, "blackpink": 20339, "garner": 20340, "curated": 20341, "manitoba": 20342, "hac": 20343, "commonly": 20344, "barton": 20345, "....#": 20346, "mortality": 20347, "livesmatter": 20348, "philosop": 20349, "shorter": 20350, "convince": 20351, "freak": 20352, "vendors": 20353, "insightful": 20354, "elly": 20355, "sensors": 20356, "eled": 20357, "sberg": 20358, "weightloss": 20359, "ukip": 20360, "spur": 20361, "private": 20362, "qua": 20363, "ssc": 20364, ",...": 20365, "supervisor": 20366, "adviser": 20367, "amazingly": 20368, "lesser": 20369, "ates": 20370, "mahon": 20371, "oooooo": 20372, "saras": 20373, "pmoindia": 20374, "waffle": 20375, "unders": 20376, "tolerance": 20377, "sculptures": 20378, "hersh": 20379, "knocking": 20380, "smoke": 20381, "catholic": 20382, "grim": 20383, "traveled": 20384, "flip": 20385, "geoff": 20386, "dinosaurs": 20387, "slept": 20388, "scarlet": 20389, 
"oki": 20390, "complaint": 20391, "obsc": 20392, "nami": 20393, "lag": 20394, "crossfit": 20395, "ufc": 20396, "mccain": 20397, "referee": 20398, "sadness": 20399, "penny": 20400, "lieu": 20401, "mode": 20402, "kier": 20403, "vols": 20404, "wis": 20405, "elon": 20406, "shea": 20407, "bao": 20408, "sonia": 20409, "claire": 20410, "emmanuel": 20411, "moisture": 20412, "digest": 20413, "viii": 20414, "teller": 20415, "chon": 20416, "accessory": 20417, "nightclub": 20418, "fossil": 20419, "awan": 20420, "husky": 20421, "aboriginal": 20422, "brandon": 20423, "fficient": 20424, "cougars": 20425, "sted": 20426, "admitted": 20427, "ignored": 20428, "contentmarketing": 20429, "agas": 20430, "vase": 20431, "executed": 20432, "negotiations": 20433, "shead": 20434, "nand": 20435, "tablets": 20436, "goth": 20437, "tsal": 20438, "dfw": 20439, "onep": 20440, "protector": 20441, "spho": 20442, "gazette": 20443, "andreas": 20444, "sser": 20445, "compilation": 20446, "hav": 20447, "containers": 20448, "broker": 20449, "socal": 20450, "porcelain": 20451, "hyuk": 20452, "airing": 20453, "ðŁĴ°": 20454, "publisher": 20455, "scenario": 20456, "spartans": 20457, "reviewing": 20458, "itudes": 20459, "edel": 20460, "pearson": 20461, "bash": 20462, "maui": 20463, "aad": 20464, "ðŁĮĬ": 20465, "liu": 20466, "ulate": 20467, "programmes": 20468, "favour": 20469, "webdesign": 20470, "realty": 20471, "motivational": 20472, "crosses": 20473, "'...": 20474, "busch": 20475, "adjustable": 20476, "arjun": 20477, "mistak": 20478, "dimension": 20479, "pistol": 20480, "weighs": 20481, "eny": 20482, "unveil": 20483, "indycar": 20484, "gordon": 20485, "fade": 20486, "franken": 20487, "qualities": 20488, "bett": 20489, "locate": 20490, "kerr": 20491, "spc": 20492, "confusion": 20493, "nee": 20494, "lucky": 20495, "bases": 20496, "depends": 20497, "firefighter": 20498, "ola": 20499, "ret": 20500, "maroon": 20501, "ðŁĶĬ": 20502, "wam": 20503, "defining": 20504, "wheat": 20505, "bil": 20506, "és": 20507, "bhai": 20508, "psych": 20509, "tau": 20510, "icans": 20511, "thik": 20512, "obile": 20513, "inspector": 20514, "ìĨĮë": 20515, "illon": 20516, "gos": 20517, "evangel": 20518, "fai": 20519, "sist": 20520, "vocation": 20521, "burge": 20522, "chistan": 20523, "renewed": 20524, "enthusiasm": 20525, "enting": 20526, "agri": 20527, "ikea": 20528, "msc": 20529, "aerospace": 20530, "sensiti": 20531, "memoir": 20532, "hospice": 20533, "cocaine": 20534, "derry": 20535, "mechanics": 20536, "Ħà¸": 20537, "tino": 20538, "reduces": 20539, "collectors": 20540, "injustice": 20541, "suppre": 20542, "vana": 20543, "abun": 20544, "napa": 20545, "susa": 20546, "oslo": 20547, "eff": 20548, "encore": 20549, "licence": 20550, "cheddar": 20551, "zal": 20552, "mount": 20553, "ðŁĴIJ": 20554, "threatens": 20555, "!!\"": 20556, "archie": 20557, "futsal": 20558, "scuba": 20559, "jos": 20560, "gnon": 20561, "sexi": 20562, "sofficial": 20563, "comparing": 20564, "dominant": 20565, "toftheday": 20566, "fait": 20567, "proposals": 20568, "gift": 20569, "yas": 20570, "cnc": 20571, "lr": 20572, "hab": 20573, "reservoir": 20574, "beliefs": 20575, "general": 20576, "marti": 20577, "td": 20578, "este": 20579, "ìł": 20580, "wil": 20581, "ðŁij¯": 20582, "ðŁĶ«": 20583, "spx": 20584, "etwork": 20585, "excerpt": 20586, "einstein": 20587, "hiro": 20588, "silhou": 20589, "teamed": 20590, "perception": 20591, "corridor": 20592, "mentalhealth": 20593, "hints": 20594, "benny": 20595, "inducted": 20596, "swx": 20597, "widesp": 20598, "speak": 20599, "cheryl": 20600, "drug": 20601, 
"ðŁĺķ": 20602, "hf": 20603, "asparagus": 20604, "mysteries": 20605, "fitzgerald": 20606, "offer": 20607, "therapist": 20608, "career": 20609, "damaging": 20610, "tsd": 20611, "peru": 20612, "weibo": 20613, "yay": 20614, "phoenix": 20615, "discre": 20616, "macbook": 20617, "barker": 20618, "stigma": 20619, "spread": 20620, "rockies": 20621, "kangar": 20622, "bridg": 20623, "pai": 20624, "bishop": 20625, "tailed": 20626, "capsule": 20627, "ðŁĴĵ": 20628, "geof": 20629, "royale": 20630, "shortlisted": 20631, "oste": 20632, "ashamed": 20633, "chapp": 20634, "keye": 20635, "cla": 20636, "screenshot": 20637, "austrian": 20638, "native": 20639, "enight": 20640, "juliet": 20641, "michele": 20642, "ðŁĮ´": 20643, "travelers": 20644, "pil": 20645, "footballer": 20646, "winchester": 20647, "ðŁĻĦ": 20648, "azerbai": 20649, "goldeng": 20650, "organisations": 20651, "interpretation": 20652, "predator": 20653, "oftheweek": 20654, "logan": 20655, "poké": 20656, "marie": 20657, "calla": 20658, "tnt": 20659, "cinde": 20660, "getic": 20661, "fitfam": 20662, "grav": 20663, "owens": 20664, "ðŁĮ±": 20665, "shootout": 20666, "salis": 20667, "commissions": 20668, "cohe": 20669, "ptic": 20670, "nixon": 20671, "hia": 20672, "ambition": 20673, "marine": 20674, "cruelty": 20675, "tk": 20676, "crude": 20677, "salty": 20678, "jima": 20679, "mongo": 20680, "irony": 20681, "onwards": 20682, "arrests": 20683, "strangers": 20684, "iger": 20685, "cyclist": 20686, "rag": 20687, "extends": 20688, "tradio": 20689, "bourg": 20690, "moi": 20691, "ella": 20692, "eable": 20693, "lexus": 20694, "aul": 20695, "dera": 20696, "historian": 20697, "morton": 20698, "tiff": 20699, "manner": 20700, "kot": 20701, "dk": 20702, "pointed": 20703, "marqu": 20704, "aan": 20705, "eney": 20706, "dublin": 20707, "onpoli": 20708, "emili": 20709, "secret": 20710, "flo": 20711, "âļ¡": 20712, "baj": 20713, "steep": 20714, "accompanied": 20715, "rumours": 20716, "devi": 20717, "purchasing": 20718, "fig": 20719, "pub": 20720, "schoo": 20721, "autonomous": 20722, "goalie": 20723, "xia": 20724, "automatically": 20725, "revers": 20726, "tero": 20727, "fuku": 20728, "titanic": 20729, "shook": 20730, "sandals": 20731, "seekers": 20732, "excav": 20733, "nordic": 20734, "bigolive": 20735, "bake": 20736, "ratt": 20737, "zak": 20738, "nep": 20739, "ðŁĺ¤": 20740, "candy": 20741, "billions": 20742, "bookworm": 20743, "ppet": 20744, "à³": 20745, "surfaces": 20746, "scars": 20747, "philip": 20748, "dogg": 20749, "cigars": 20750, "cote": 20751, "translated": 20752, "curator": 20753, "sindh": 20754, "hangover": 20755, "brewer": 20756, "ones": 20757, "elton": 20758, "ðŁĴªðŁı¼": 20759, "marcu": 20760, "elliot": 20761, "righte": 20762, "dioce": 20763, "russ": 20764, "railways": 20765, "grandson": 20766, "ascen": 20767, "apology": 20768, "await": 20769, "mobili": 20770, "respir": 20771, "partisan": 20772, "olivi": 20773, "strike": 20774, "yoo": 20775, "whitehouse": 20776, "expressed": 20777, "pups": 20778, "bedford": 20779, "cultur": 20780, "frogs": 20781, "flying": 20782, "cavali": 20783, "cds": 20784, "friger": 20785, "streetphotography": 20786, "resolve": 20787, "taliban": 20788, "kang": 20789, "crushing": 20790, "jum": 20791, "ðŁĺĴ": 20792, "williamson": 20793, "tang": 20794, "curly": 20795, "tman": 20796, "veteran": 20797, "faire": 20798, "artificialintelligence": 20799, "unanim": 20800, "pren": 20801, "backdrop": 20802, "frances": 20803, "occer": 20804, "dorothy": 20805, "working": 20806, "arthr": 20807, "converted": 20808, "daylight": 20809, "servant": 20810, 
"paddle": 20811, "complaining": 20812, "thirty": 20813, "nadal": 20814, "aku": 20815, "ibrahim": 20816, "addressed": 20817, "piss": 20818, "greenhouse": 20819, "battalion": 20820, "simulator": 20821, "outlets": 20822, "embroidery": 20823, "ðŁĵ±": 20824, "fiscal": 20825, "gerard": 20826, "sassy": 20827, "ðŁİīðŁİīðŁİī": 20828, "ventures": 20829, "merit": 20830, "publicity": 20831, "ðŁijĪ": 20832, "sophisticated": 20833, "ctu": 20834, "conventional": 20835, "condolences": 20836, "israel": 20837, "tradition": 20838, "aran": 20839, "tess": 20840, "glad": 20841, "ðŁĺĬðŁĺĬ": 20842, "correction": 20843, "geon": 20844, "amd": 20845, "orship": 20846, "beast": 20847, "chment": 20848, "ìŀ": 20849, "nico": 20850, "wknd": 20851, "wels": 20852, "cushion": 20853, "belie": 20854, "voc": 20855, "idiots": 20856, "underneath": 20857, "puma": 20858, "cornell": 20859, "enation": 20860, "lul": 20861, "swach": 20862, "abig": 20863, "urer": 20864, "mie": 20865, "formerly": 20866, "caf": 20867, "ernal": 20868, "chorus": 20869, "julius": 20870, "senator": 20871, "âľį": 20872, "whir": 20873, "salvador": 20874, "phd": 20875, "unified": 20876, "booster": 20877, "graphical": 20878, "wrec": 20879, "sonny": 20880, "miz": 20881, "derers": 20882, "sall": 20883, "vens": 20884, "tuscany": 20885, "wid": 20886, "yong": 20887, "kurds": 20888, "waz": 20889, "trolls": 20890, "macro": 20891, "caturday": 20892, "pressing": 20893, "sasha": 20894, "centennial": 20895, "gusts": 20896, "emc": 20897, "before": 20898, "denise": 20899, "cust": 20900, "ðŁĵ¢": 20901, "looo": 20902, "basel": 20903, "england": 20904, "yolo": 20905, "ardu": 20906, "manifesto": 20907, "doha": 20908, "ìľ": 20909, "knives": 20910, "bournemouth": 20911, "bibl": 20912, "barb": 20913, "alicia": 20914, "Ø©": 20915, "comer": 20916, "cyclone": 20917, "git": 20918, "anews": 20919, "characteri": 20920, "ventura": 20921, "intra": 20922, "sfgiants": 20923, "hut": 20924, "bea": 20925, "darwin": 20926, "eller": 20927, "alv": 20928, "reese": 20929, "bly": 20930, "karan": 20931, "conclusion": 20932, "manny": 20933, "flakes": 20934, "uniteblue": 20935, "nadu": 20936, "copp": 20937, "edges": 20938, "lancashire": 20939, "ials": 20940, "otta": 20941, "philippe": 20942, "lent": 20943, "chee": 20944, "mentors": 20945, "festival": 20946, "anism": 20947, "complimentary": 20948, "rj": 20949, "pug": 20950, "dine": 20951, "wei": 20952, "cliffs": 20953, "sarmy": 20954, "tiveness": 20955, "treasury": 20956, "iland": 20957, "aftermath": 20958, "rabbi": 20959, "oun": 20960, "bouquet": 20961, "heritage": 20962, "zion": 20963, "surrender": 20964, "shenan": 20965, "inks": 20966, "karl": 20967, "ghty": 20968, "policing": 20969, "examination": 20970, "cey": 20971, "persu": 20972, "measurement": 20973, "hydrogen": 20974, "luhan": 20975, "âłĢâłĢâłĢâłĢ": 20976, "wari": 20977, "оÐ": 20978, "jy": 20979, "fowler": 20980, "mish": 20981, "alfre": 20982, "âĺij": 20983, "bbnaija": 20984, "catalogue": 20985, "recognised": 20986, "saver": 20987, "huskies": 20988, "colin": 20989, "mundo": 20990, "siva": 20991, "png": 20992, "discounted": 20993, "manutd": 20994, "fresno": 20995, "devin": 20996, "preliminary": 20997, "trophies": 20998, "plastics": 20999, "dug": 21000, "procu": 21001, "indigo": 21002, "gard": 21003, "dylan": 21004, "pitches": 21005, "groundbreaking": 21006, "inson": 21007, "blac": 21008, "anthology": 21009, "fh": 21010, "explic": 21011, "rard": 21012, "admiral": 21013, "sochi": 21014, "lashes": 21015, "splendid": 21016, "envy": 21017, "adv": 21018, "sexy": 21019, "festivities": 21020, "sticking": 
21021, "bib": 21022, "thrill": 21023, "opp": 21024, "ariel": 21025, "botanical": 21026, "endurance": 21027, "females": 21028, "bricks": 21029, "vatican": 21030, "blackpool": 21031, "bermu": 21032, "brough": 21033, "roller": 21034, "bid": 21035, "suede": 21036, "slovenia": 21037, "mming": 21038, "mlb": 21039, "medalist": 21040, "dians": 21041, "rehabilitation": 21042, "neon": 21043, "sgo": 21044, "lithu": 21045, "ramos": 21046, "zed": 21047, "pianist": 21048, "intensive": 21049, "broadband": 21050, "study": 21051, "petersburg": 21052, "luca": 21053, "ahhhh": 21054, "physician": 21055, "dillon": 21056, "telecom": 21057, "grief": 21058, "mun": 21059, "acro": 21060, "sided": 21061, "sly": 21062, "blows": 21063, "classiccars": 21064, "trium": 21065, "argy": 21066, "?:": 21067, "hri": 21068, "marshmal": 21069, "âĢĵ": 21070, "topping": 21071, "warsaw": 21072, "transc": 21073, "preservation": 21074, "bav": 21075, "refriger": 21076, "experiments": 21077, "äº": 21078, "glit": 21079, "sliga": 21080, "gage": 21081, "factor": 21082, "flavours": 21083, "brony": 21084, "spo": 21085, "cookbook": 21086, "carriage": 21087, "away": 21088, "nyfw": 21089, "onian": 21090, "wg": 21091, "simpsons": 21092, "rolex": 21093, "ðŁı¿": 21094, "crosby": 21095, "ãħ¤": 21096, "credi": 21097, "syndic": 21098, "pubs": 21099, "alife": 21100, "poorly": 21101, "maced": 21102, "ðŁĺŀ": 21103, "behindthe": 21104, "wenger": 21105, "nats": 21106, "ðŁİŁ": 21107, "rubbish": 21108, "procedures": 21109, "typhoon": 21110, "ophobia": 21111, "erdo": 21112, "fuel": 21113, "viera": 21114, "bumps": 21115, "millennium": 21116, "newzealand": 21117, "lectures": 21118, "iton": 21119, "milky": 21120, "responded": 21121, "ê°": 21122, "landscape": 21123, "..@": 21124, "bother": 21125, "âĸ¶": 21126, "zhang": 21127, "huawei": 21128, "tuition": 21129, "sworn": 21130, "inu": 21131, "yor": 21132, "paolo": 21133, "auditions": 21134, "abil": 21135, "malaysian": 21136, "hops": 21137, "feathers": 21138, "mple": 21139, "auts": 21140, "ão": 21141, "bounty": 21142, "iche": 21143, "ìĺ": 21144, "shq": 21145, "pinot": 21146, "gears": 21147, "disappear": 21148, "videogames": 21149, "tna": 21150, "alzheimer": 21151, "ðŁĮŀ": 21152, "aji": 21153, "underwear": 21154, "switching": 21155, "signage": 21156, "oscar": 21157, "econ": 21158, "drow": 21159, "clint": 21160, "plated": 21161, "gundy": 21162, "emblem": 21163, "hoes": 21164, "icist": 21165, "nelly": 21166, "junior": 21167, "roadshow": 21168, "minerals": 21169, "atle": 21170, "alexandria": 21171, "acclaimed": 21172, "vell": 21173, "shiva": 21174, "adhe": 21175, "enne": 21176, "amnesty": 21177, "hounds": 21178, "councillor": 21179, "ðŁĴ¦": 21180, "aesthe": 21181, "partnering": 21182, "influenced": 21183, "magno": 21184, "flare": 21185, "extinction": 21186, "civilian": 21187, "majesty": 21188, "vail": 21189, "lawmakers": 21190, "racks": 21191, "mcc": 21192, "orian": 21193, "spices": 21194, "errors": 21195, "mayer": 21196, "coca": 21197, "pai": 21198, "sooooo": 21199, "retiring": 21200, "bathro": 21201, "ðŁĻĮðŁĻĮ": 21202, "âĸª": 21203, "suf": 21204, "endorsement": 21205, "building": 21206, "brooch": 21207, "palla": 21208, "arvind": 21209, "agent": 21210, "karate": 21211, "rhi": 21212, "ctv": 21213, "taine": 21214, "umm": 21215, "bax": 21216, "reigns": 21217, "uniof": 21218, "enterprises": 21219, "adele": 21220, "flake": 21221, "attire": 21222, "bruce": 21223, "bahamas": 21224, "gravy": 21225, "sain": 21226, "cheek": 21227, "trivi": 21228, "lov": 21229, "een": 21230, "bblo": 21231, "ladygaga": 21232, "itta": 21233, 
".\"-": 21234, "dustin": 21235, "observatory": 21236, "eighth": 21237, "bloomberg": 21238, "khs": 21239, "fcc": 21240, "gist": 21241, "commemorate": 21242, "veer": 21243, "sexuality": 21244, "edc": 21245, "nicole": 21246, "vacancy": 21247, "user": 21248, "sona": 21249, ":'(": 21250, "diploma": 21251, "tend": 21252, "upgrades": 21253, "ÅŁ": 21254, "jurassic": 21255, "cardiac": 21256, "drs": 21257, "widespread": 21258, "Ãł": 21259, "dailies": 21260, "vendor": 21261, "simplicity": 21262, "wider": 21263, "lenses": 21264, "supplements": 21265, "depos": 21266, "observed": 21267, "vines": 21268, "partially": 21269, "renewal": 21270, "collaborate": 21271, "alig": 21272, "finity": 21273, "phu": 21274, "zzy": 21275, "petit": 21276, "ðŁĵħ": 21277, "zin": 21278, "igu": 21279, "smack": 21280, "fallon": 21281, "ðŁĵ£": 21282, "backwards": 21283, "component": 21284, "oso": 21285, "compatible": 21286, "binding": 21287, "zurich": 21288, "thome": 21289, "wounds": 21290, "lyric": 21291, "freshmen": 21292, "sneaky": 21293, "fibro": 21294, "diet": 21295, "employer": 21296, "insect": 21297, "hated": 21298, "scher": 21299, "razor": 21300, "nsw": 21301, "booker": 21302, "californi": 21303, "avfc": 21304, "°": 21305, "pretending": 21306, "pepsi": 21307, "alis": 21308, "untitled": 21309, "kart": 21310, "grandparents": 21311, "ethe": 21312, "ock": 21313, "luxemb": 21314, "visuals": 21315, "smallbusiness": 21316, "abdullah": 21317, "minho": 21318, "subaru": 21319, "hra": 21320, "revealing": 21321, "heartbreaking": 21322, "clarity": 21323, "amg": 21324, "slr": 21325, "****": 21326, "âŀĸ": 21327, "record": 21328, "iciary": 21329, "minded": 21330, "yeh": 21331, "excessive": 21332, "knuck": 21333, "icecream": 21334, "truth": 21335, "evic": 21336, "tastic": 21337, "antarc": 21338, "rendering": 21339, ",,": 21340, "mitt": 21341, "lorenzo": 21342, "stpatrick": 21343, "boundary": 21344, "zig": 21345, "vocab": 21346, "osaka": 21347, "furn": 21348, "tun": 21349, "gul": 21350, "sounding": 21351, "blogger": 21352, "utterly": 21353, "gaf": 21354, "advancing": 21355, "lcd": 21356, "margin": 21357, "lifelong": 21358, "solstice": 21359, "shra": 21360, "waits": 21361, "plear": 21362, "breach": 21363, "enligh": 21364, "ader": 21365, "ittle": 21366, "cation": 21367, "hoon": 21368, "studied": 21369, "?????": 21370, "kash": 21371, "evangeli": 21372, "psl": 21373, "weights": 21374, "metals": 21375, "tyres": 21376, "turno": 21377, "wie": 21378, "carb": 21379, "gale": 21380, "seal": 21381, "sunite": 21382, "amic": 21383, "patterson": 21384, "án": 21385, "euph": 21386, "upstairs": 21387, "qualifiers": 21388, "khalifa": 21389, "applemusic": 21390, "ìĨĮëħ": 21391, "vaughan": 21392, "alter": 21393, "cruiser": 21394, "mua": 21395, "tana": 21396, "katrina": 21397, "idols": 21398, "spoiled": 21399, "secretly": 21400, "fibre": 21401, "partnered": 21402, "umes": 21403, "giov": 21404, "comet": 21405, "screenshotsaturday": 21406, "keller": 21407, "filtr": 21408, "fet": 21409, "conway": 21410, "peu": 21411, "badminton": 21412, "gid": 21413, "mound": 21414, "donkey": 21415, "buff": 21416, "leather": 21417, "largely": 21418, "broch": 21419, "intments": 21420, "amuse": 21421, "rk": 21422, "stove": 21423, "impacted": 21424, "cont": 21425, "cracks": 21426, "prisoner": 21427, "bari": 21428, "contractor": 21429, "orioles": 21430, "dominate": 21431, "polar": 21432, "amelia": 21433, "drc": 21434, "ðŁijĮðŁijĮ": 21435, "vist": 21436, "suarez": 21437, "injection": 21438, "blooms": 21439, "ðŁļ¨ðŁļ¨": 21440, "stiff": 21441, "paypal": 21442, "snowing": 21443, 
"thursdays": 21444, "goose": 21445, "wedge": 21446, "educated": 21447, "weakness": 21448, "decker": 21449, "abudha": 21450, "breezy": 21451, "ÛĮ": 21452, "hopeful": 21453, "obi": 21454, "raider": 21455, "gham": 21456, "deu": 21457, "seve": 21458, "partly": 21459, "fut": 21460, "infused": 21461, "merri": 21462, "thane": 21463, "sometime": 21464, "hue": 21465, "mein": 21466, "credit": 21467, "sliding": 21468, "rande": 21469, "cherry": 21470, "deadpool": 21471, "shol": 21472, "aram": 21473, "underwood": 21474, "skye": 21475, "disturbing": 21476, "mnt": 21477, "polished": 21478, "guardians": 21479, "hadn": 21480, "picasso": 21481, "arius": 21482, "akshay": 21483, "irri": 21484, "jh": 21485, "happen": 21486, "lakh": 21487, "dalton": 21488, "atthe": 21489, "swell": 21490, "marsha": 21491, "reh": 21492, "cours": 21493, "jkt": 21494, "topus": 21495, "service": 21496, "rink": 21497, "hackers": 21498, "donovan": 21499, "horo": 21500, "tcm": 21501, "mayhem": 21502, "chase": 21503, "devops": 21504, "kensing": 21505, "scup": 21506, "shere": 21507, "qualification": 21508, "clive": 21509, "tong": 21510, "nancy": 21511, "maris": 21512, "derdale": 21513, "berman": 21514, "cinderella": 21515, "jolly": 21516, "cic": 21517, "loot": 21518, "collectibles": 21519, "homicide": 21520, "gge": 21521, "epidemic": 21522, "suites": 21523, "muddy": 21524, "gimme": 21525, "erec": 21526, "-*": 21527, "talla": 21528, "lisle": 21529, "embroide": 21530, "ðŁĩ©ðŁĩª": 21531, "verizon": 21532, "vector": 21533, "beanie": 21534, "artisan": 21535, "gain": 21536, "flores": 21537, "vigil": 21538, "uso": 21539, "ðŁĻıðŁı½": 21540, "grinding": 21541, "gher": 21542, "airports": 21543, "responsive": 21544, "shaft": 21545, "cancel": 21546, "ceremonies": 21547, "eme": 21548, "atari": 21549, "brushes": 21550, "eager": 21551, "bohemi": 21552, "childrens": 21553, "yankee": 21554, "maa": 21555, "suspense": 21556, "moran": 21557, "macar": 21558, "sunflower": 21559, "crew": 21560, "void": 21561, "kear": 21562, "fashioned": 21563, "jennings": 21564, "sundayfunday": 21565, "submissions": 21566, "mead": 21567, "herman": 21568, "wai": 21569, "critically": 21570, "leum": 21571, "baekhyun": 21572, "forcing": 21573, "cobra": 21574, "ãģ®": 21575, "acquire": 21576, "alk": 21577, "geology": 21578, "primar": 21579, "importantly": 21580, "irez": 21581, "bundesliga": 21582, "curiosity": 21583, "sena": 21584, "strict": 21585, "consoli": 21586, "winters": 21587, "venom": 21588, "cheltenham": 21589, "ðŁįº": 21590, "cena": 21591, "tat": 21592, "bain": 21593, "glover": 21594, "undercover": 21595, "asses": 21596, "carn": 21597, "memorialday": 21598, "ameli": 21599, "irene": 21600, "chon": 21601, "synthesis": 21602, "speedy": 21603, "mitsubi": 21604, "slayer": 21605, "composite": 21606, "understands": 21607, "pew": 21608, "interrup": 21609, "henri": 21610, "morrow": 21611, "anom": 21612, "thofjuly": 21613, "glee": 21614, "three": 21615, "ðŁĺ®": 21616, "andhi": 21617, "chatt": 21618, "renewables": 21619, "yes": 21620, "transfers": 21621, "!!!!!!!!": 21622, "babu": 21623, "duter": 21624, "loops": 21625, "peers": 21626, "oilers": 21627, "paulo": 21628, "ication": 21629, "hmu": 21630, "wara": 21631, "mercer": 21632, "homeland": 21633, "fuji": 21634, "aley": 21635, "yearbook": 21636, "rem": 21637, "reen": 21638, "absur": 21639, "bois": 21640, "]:": 21641, "caesar": 21642, "shotgun": 21643, "kurdish": 21644, "oren": 21645, "rae": 21646, "ancies": 21647, "typic": 21648, "fh": 21649, "default": 21650, "replic": 21651, "luk": 21652, "transactions": 21653, "rys": 21654, 
"infantry": 21655, "ðŁį¾": 21656, "chow": 21657, "chickens": 21658, "bagh": 21659, "wyatt": 21660, "aye": 21661, "ggi": 21662, "brews": 21663, "editions": 21664, "mira": 21665, "commencement": 21666, "presu": 21667, "periscope": 21668, "ichi": 21669, "guatemala": 21670, "zambia": 21671, "paints": 21672, "witches": 21673, "wani": 21674, "undere": 21675, "croy": 21676, "vows": 21677, "usmc": 21678, "hearted": 21679, "theatres": 21680, "shuffle": 21681, "level": 21682, "multic": 21683, "squeeze": 21684, "fern": 21685, "appet": 21686, "postal": 21687, "malt": 21688, "onboard": 21689, "ldnt": 21690, "coo": 21691, "ssc": 21692, "kac": 21693, "ðŁĺĩ": 21694, "scrap": 21695, "marcos": 21696, "dealers": 21697, "annu": 21698, "miller": 21699, "cove": 21700, "ulary": 21701, "vladimir": 21702, "beef": 21703, "thur": 21704, "pickled": 21705, "sesame": 21706, "bengaluru": 21707, "mott": 21708, "kathleen": 21709, "hist": 21710, "notor": 21711, "drank": 21712, "duchess": 21713, "snowfall": 21714, "eff": 21715, "tiny": 21716, "jn": 21717, "syour": 21718, "specialists": 21719, "scotus": 21720, "baylor": 21721, "everest": 21722, "malibu": 21723, "prem": 21724, "harmful": 21725, "lali": 21726, "bates": 21727, "gye": 21728, "differenti": 21729, "andra": 21730, "geometry": 21731, "elover": 21732, "blackout": 21733, "====": 21734, "kota": 21735, "interact": 21736, "asian": 21737, "layo": 21738, "samurai": 21739, "fidel": 21740, "exhausted": 21741, "gladi": 21742, "pdt": 21743, "spheric": 21744, "antiqu": 21745, "guitar": 21746, "sturi": 21747, "hopper": 21748, "angle": 21749, "fills": 21750, "slap": 21751, "mith": 21752, "rodney": 21753, "ongi": 21754, "insom": 21755, "preventing": 21756, "cassidy": 21757, "apho": 21758, "oregon": 21759, "loin": 21760, "hammond": 21761, "contributing": 21762, "fn": 21763, "garri": 21764, "orion": 21765, "compelling": 21766, "escaping": 21767, "aiming": 21768, "plumb": 21769, "bistro": 21770, "beasts": 21771, "concerning": 21772, "boe": 21773, "dopp": 21774, "shoplocal": 21775, "stumbled": 21776, "âĤ¹": 21777, "nazis": 21778, "âĢįâĻĤï¸ı": 21779, "gesture": 21780, "warts": 21781, "usopen": 21782, "higgins": 21783, "charli": 21784, "hangs": 21785, "bombers": 21786, "°:": 21787, "feeds": 21788, "cch": 21789, "stil": 21790, "nicola": 21791, "ðŁĵº": 21792, "clamation": 21793, "tropic": 21794, "afro": 21795, "ouk": 21796, "expenses": 21797, "derrick": 21798, "aline": 21799, "faw": 21800, "regard": 21801, "imer": 21802, "satin": 21803, "thium": 21804, "ryder": 21805, "pearl": 21806, "tess": 21807, "mmmmm": 21808, "senses": 21809, "ðŁĩ¹": 21810, "positive": 21811, "exhaust": 21812, "occur": 21813, "norris": 21814, "lilly": 21815, "isles": 21816, "directing": 21817, "yofficial": 21818, "countless": 21819, "samar": 21820, "onstage": 21821, "flock": 21822, "mirrors": 21823, "archer": 21824, "moi": 21825, "kd": 21826, "viv": 21827, "inos": 21828, "sikh": 21829, "lei": 21830, "sensory": 21831, "brits": 21832, "knox": 21833, "chestnut": 21834, "opy": 21835, "coliseum": 21836, "zaf": 21837, "divin": 21838, "adapter": 21839, ":)))": 21840, "temple": 21841, "kun": 21842, "helmets": 21843, "tdf": 21844, "guide": 21845, "mold": 21846, "oids": 21847, "luther": 21848, "heis": 21849, "monastery": 21850, "spree": 21851, "klu": 21852, "britney": 21853, "jaguars": 21854, "greats": 21855, "ccc": 21856, "kyrie": 21857, "machinery": 21858, "cricket": 21859, "rero": 21860, "abo": 21861, "aspiring": 21862, "semifinals": 21863, "aless": 21864, "signatures": 21865, "vard": 21866, "meth": 21867, "herbal": 21868, 
"holden": 21869, "kingdom": 21870, "apor": 21871, "reggie": 21872, "oreo": 21873, "palestinians": 21874, "emmys": 21875, "sectional": 21876, "roi": 21877, "neymar": 21878, "quel": 21879, "cull": 21880, "lka": 21881, "hazel": 21882, "estimate": 21883, "ulties": 21884, "gow": 21885, "bea": 21886, "purchases": 21887, "belts": 21888, "protects": 21889, "mé": 21890, "guessing": 21891, "bbo": 21892, "claudia": 21893, "fracking": 21894, "jonny": 21895, "elk": 21896, "celtic": 21897, "almighty": 21898, "raje": 21899, "courtyard": 21900, "igi": 21901, "canes": 21902, "ðŁĴªðŁı»": 21903, "bankrup": 21904, "lethal": 21905, "âľĮï¸ı": 21906, "graphicdesign": 21907, "vader": 21908, "pencils": 21909, "roughly": 21910, "dante": 21911, "mfg": 21912, "constell": 21913, "camel": 21914, "jb": 21915, "blossoms": 21916, "ento": 21917, "balochistan": 21918, "cinemato": 21919, "illard": 21920, "jersey": 21921, "consent": 21922, "dented": 21923, "contempl": 21924, "scher": 21925, "holi": 21926, "lough": 21927, "stour": 21928, "ayo": 21929, "beginners": 21930, "curb": 21931, "vhs": 21932, "ajax": 21933, "duff": 21934, "aveng": 21935, "domest": 21936, "committing": 21937, "aired": 21938, "chap": 21939, "hedgehog": 21940, "disappointing": 21941, "freelance": 21942, "inland": 21943, "charms": 21944, "ðŁĺįâĿ¤ï¸ı": 21945, "aish": 21946, "mx": 21947, "buckle": 21948, "tidal": 21949, "permit": 21950, "boating": 21951, "racha": 21952, "kendrick": 21953, "bello": 21954, "bhi": 21955, "plea": 21956, "estimates": 21957, "lb": 21958, "apologies": 21959, "jaya": 21960, "bbl": 21961, "astoni": 21962, "interstate": 21963, "maintaining": 21964, "elbow": 21965, "mup": 21966, "epit": 21967, "ðŁĺ¡": 21968, "violations": 21969, "defend": 21970, "beh": 21971, "slc": 21972, "amir": 21973, "puri": 21974, "tium": 21975, "fifa": 21976, "blurry": 21977, "scrim": 21978, "ðŁĻıðŁı¾": 21979, "maple": 21980, "relatives": 21981, "âĺĿ": 21982, "choc": 21983, "connor": 21984, "⾨⾨": 21985, "whisp": 21986, "listings": 21987, "maze": 21988, "thanking": 21989, "ridd": 21990, "grassroots": 21991, "shifting": 21992, "desperately": 21993, "gorilla": 21994, "deni": 21995, "jules": 21996, "strath": 21997, "gley": 21998, "jain": 21999, "buick": 22000, "tanner": 22001, "ðŁĴĿ": 22002, "gae": 22003, "prim": 22004, "itors": 22005, "nano": 22006, "separation": 22007, "armenia": 22008, "bordeaux": 22009, "ðŁħ": 22010, "pjnet": 22011, "burial": 22012, "ebon": 22013, "gloss": 22014, "renew": 22015, "grier": 22016, "speeds": 22017, "comicbooks": 22018, "symboli": 22019, "purposes": 22020, "ãħłãħł": 22021, "spatial": 22022, "notable": 22023, "cion": 22024, "nps": 22025, "hoffman": 22026, "norman": 22027, "rtg": 22028, "dusty": 22029, "situated": 22030, "tran": 22031, "kfc": 22032, "emen": 22033, "nickel": 22034, "hastings": 22035, "settling": 22036, "grit": 22037, "lena": 22038, "waw": 22039, "arts": 22040, "gum": 22041, "caregi": 22042, "lewis": 22043, "sapphire": 22044, "remember": 22045, "embedded": 22046, "tlc": 22047, "blat": 22048, "sergeant": 22049, "elsa": 22050, "bootcamp": 22051, "bowman": 22052, "photographic": 22053, "pillars": 22054, "directioners": 22055, "classified": 22056, "nois": 22057, "veer": 22058, "barrels": 22059, "whoop": 22060, "ðŁĺ±ðŁĺ±": 22061, "female": 22062, "petroleum": 22063, "media": 22064, "efc": 22065, "pokémon": 22066, "à¤ķ": 22067, "enthusiastic": 22068, "varun": 22069, "profiles": 22070, "pediatric": 22071, "accidents": 22072, "conrad": 22073, "jang": 22074, "jojo": 22075, "acor": 22076, "observer": 22077, "lf": 22078, "livestock": 
22079, "forgi": 22080, "fos": 22081, "elm": 22082, "anand": 22083, "goe": 22084, "cere": 22085, "avoiding": 22086, "grit": 22087, "oman": 22088, "thankfully": 22089, "scattered": 22090, "nicky": 22091, "cylinder": 22092, "cheesy": 22093, "diver": 22094, "mahesh": 22095, "caves": 22096, "earliest": 22097, "quinte": 22098, "subjects": 22099, "bend": 22100, "gulf": 22101, "vocalist": 22102, "glue": 22103, "patches": 22104, "unstopp": 22105, "snyder": 22106, "demonstrating": 22107, "pio": 22108, "horns": 22109, "wickets": 22110, "andthe": 22111, "rama": 22112, "yoon": 22113, "straight": 22114, "bedtime": 22115, "orang": 22116, "bullets": 22117, "saurus": 22118, "miners": 22119, "incidents": 22120, "!...": 22121, "ðŁİ¸": 22122, "agers": 22123, "handles": 22124, "states": 22125, "inity": 22126, "dons": 22127, "incredible": 22128, "eminem": 22129, "aviv": 22130, "rudy": 22131, "mozart": 22132, "folklore": 22133, "appliances": 22134, "mtl": 22135, "frey": 22136, "dias": 22137, "hua": 22138, "pageant": 22139, "strive": 22140, "imprison": 22141, "bullish": 22142, "rana": 22143, "alerts": 22144, "bbmas": 22145, "hyper": 22146, "derbyshire": 22147, "recre": 22148, "redd": 22149, "deborah": 22150, "cosmos": 22151, "lawson": 22152, "melanie": 22153, "psycho": 22154, "hoor": 22155, "doodles": 22156, "sniper": 22157, "shady": 22158, "mantle": 22159, "canadian": 22160, "newyear": 22161, "interactions": 22162, "separated": 22163, "cords": 22164, "spirituality": 22165, "apu": 22166, "ito": 22167, "pct": 22168, "pelosi": 22169, "rebellion": 22170, "seiz": 22171, "worcester": 22172, "sectors": 22173, "uli": 22174, "santa": 22175, "е": 22176, "ðŁĩªðŁĩ¸": 22177, "biased": 22178, "classical": 22179, "gamma": 22180, "deeplear": 22181, "emerge": 22182, "backer": 22183, "surance": 22184, "handcrafted": 22185, "ðŁİ¥": 22186, "francis": 22187, "millan": 22188, "ici": 22189, "crown": 22190, "wow": 22191, "striped": 22192, "unfair": 22193, "relaxation": 22194, "³ï¸ı": 22195, "embracing": 22196, "shealth": 22197, "paleo": 22198, "martini": 22199, "distillery": 22200, "wrink": 22201, "ork": 22202, "nath": 22203, "hayley": 22204, "courthouse": 22205, "siber": 22206, "sadi": 22207, "quietly": 22208, "melt": 22209, "msm": 22210, "meh": 22211, "smartphones": 22212, "relent": 22213, "pping": 22214, "warwick": 22215, "cologne": 22216, "glia": 22217, "cotton": 22218, "prog": 22219, "lone": 22220, "ipsw": 22221, "starters": 22222, "expands": 22223, "ump": 22224, "sued": 22225, "skipper": 22226, "infections": 22227, "ingle": 22228, "á": 22229, "clerk": 22230, "demonstrate": 22231, "acar": 22232, "ðŁĺĤðŁĺĤðŁĺĤ": 22233, "tibet": 22234, "buns": 22235, "alom": 22236, "demolition": 22237, "ssia": 22238, "gst": 22239, "[]": 22240, "soar": 22241, "âĺĢ": 22242, "ðŁĺª": 22243, "ðŁĵĬ": 22244, "deepest": 22245, "beyond": 22246, "aret": 22247, "attends": 22248, "activated": 22249, "dimit": 22250, "âļªï¸ı": 22251, "highlighted": 22252, "magazines": 22253, "rumor": 22254, "azza": 22255, "stephens": 22256, "dolph": 22257, "shockey": 22258, "mats": 22259, "weav": 22260, "melan": 22261, "servers": 22262, "traum": 22263, "kush": 22264, "æĹ": 22265, "babys": 22266, "paz": 22267, "aal": 22268, "lause": 22269, "breakers": 22270, "canterbury": 22271, "ulture": 22272, "miri": 22273, "euros": 22274, "taneous": 22275, "impressions": 22276, "dutch": 22277, "ild": 22278, "ghi": 22279, "purdue": 22280, "adequate": 22281, "lp": 22282, "syner": 22283, "angler": 22284, "durable": 22285, "galore": 22286, "rown": 22287, "mgmt": 22288, "ðŁĵĮ": 22289, "lucia": 
22290, "âĺijï¸ı": 22291, "zayn": 22292, "borrow": 22293, ".(": 22294, "northumber": 22295, "crush": 22296, "enga": 22297, "sush": 22298, "extravag": 22299, "tout": 22300, "mahal": 22301, "alistic": 22302, "thermo": 22303, "galleries": 22304, "esse": 22305, "chibi": 22306, "attractions": 22307, "lexington": 22308, "legislature": 22309, "documented": 22310, "residen": 22311, "brownies": 22312, "wf": 22313, "stool": 22314, "planets": 22315, "shoppers": 22316, "conductor": 22317, "msp": 22318, "tricky": 22319, "fruity": 22320, "endra": 22321, "feelthe": 22322, "whipped": 22323, "hairstyle": 22324, "refer": 22325, "ook": 22326, "octopus": 22327, "audiences": 22328, "kumar": 22329, "afterno": 22330, "optim": 22331, "cfl": 22332, "nip": 22333, "geni": 22334, "alphabet": 22335, "annab": 22336, "lamin": 22337, "accepts": 22338, "lng": 22339, "ðŁĺ«": 22340, "tine": 22341, "acom": 22342, "cheerleaders": 22343, "tk": 22344, "gron": 22345, "vg": 22346, "kung": 22347, "jax": 22348, "dhabi": 22349, "rss": 22350, "mackenzie": 22351, "beirut": 22352, "cleanup": 22353, "gypsy": 22354, "stell": 22355, "burger": 22356, "hurricanes": 22357, "education": 22358, "stina": 22359, "âĻ¡âĻ¡": 22360, "unfortunate": 22361, "jeremi": 22362, "badger": 22363, "aters": 22364, ":âĢ¦": 22365, "terra": 22366, "sublime": 22367, "stud": 22368, "ymca": 22369, "mru": 22370, "duterte": 22371, "brennan": 22372, "bulb": 22373, "melo": 22374, "ylon": 22375, "hacker": 22376, "cred": 22377, "gud": 22378, "asan": 22379, "padilla": 22380, "embroidered": 22381, "vietnamese": 22382, "pioneers": 22383, "projection": 22384, "reboot": 22385, "idc": 22386, "aney": 22387, "primer": 22388, "suffers": 22389, "winding": 22390, "pon": 22391, "stoday": 22392, "morn": 22393, "uch": 22394, "allin": 22395, "adidas": 22396, "elizabeth": 22397, "tuck": 22398, "ography": 22399, "ðŁļĢ": 22400, "beg": 22401, "osborne": 22402, "ghetto": 22403, "rh": 22404, "cnn": 22405, "irma": 22406, "makin": 22407, "cables": 22408, "murders": 22409, "ocks": 22410, "insta": 22411, "alas": 22412, "sik": 22413, "cuff": 22414, "lare": 22415, "foodies": 22416, "ovic": 22417, "atom": 22418, "geometric": 22419, "empathy": 22420, "ี": 22421, "centenary": 22422, "newspapers": 22423, "administrative": 22424, "ðŁİĬ": 22425, "stive": 22426, "contractors": 22427, "lett": 22428, "tasmania": 22429, "awesomeness": 22430, "density": 22431, "veen": 22432, "princeton": 22433, "frequently": 22434, "reject": 22435, "ghi": 22436, "modular": 22437, "ceramics": 22438, "shag": 22439, "kiwi": 22440, "canvas": 22441, "sweatshirt": 22442, "anj": 22443, "timm": 22444, "napoli": 22445, "iler": 22446, "appeals": 22447, "hamilton": 22448, "mayo": 22449, "weave": 22450, "arranged": 22451, "wharf": 22452, "occupy": 22453, "bvb": 22454, "asaki": 22455, "otter": 22456, "norm": 22457, "vies": 22458, "detox": 22459, "tional": 22460, "derek": 22461, "idad": 22462, "admissions": 22463, "constituency": 22464, "upper": 22465, "woot": 22466, "alloy": 22467, "seve": 22468, "lub": 22469, "uncomfortable": 22470, "edwin": 22471, "abre": 22472, "dwight": 22473, "arche": 22474, "virtually": 22475, "spol": 22476, "prie": 22477, "aii": 22478, "err": 22479, "switch": 22480, "barack": 22481, "seok": 22482, "coul": 22483, "wnt": 22484, "poul": 22485, "olive": 22486, "caffeine": 22487, "cardiff": 22488, "notorious": 22489, "demp": 22490, "excess": 22491, "barr": 22492, "tford": 22493, "ajay": 22494, "bumped": 22495, "mythology": 22496, "shelley": 22497, "falcon": 22498, "shakespeare": 22499, "mustangs": 22500, "noted": 22501, 
"bone": 22502, "civilization": 22503, "syd": 22504, "parsons": 22505, "unofficial": 22506, "hyped": 22507, "spends": 22508, "opposed": 22509, "vings": 22510, "spacex": 22511, "notification": 22512, "deciding": 22513, "biotech": 22514, "outsi": 22515, "salah": 22516, "!.": 22517, "fed": 22518, "ssy": 22519, "cms": 22520, "badgers": 22521, "cro": 22522, "elaine": 22523, "nba": 22524, "dyour": 22525, "nant": 22526, "honeymoon": 22527, "climbed": 22528, "conomy": 22529, "atha": 22530, "mell": 22531, "nebula": 22532, "naturephotography": 22533, "julie": 22534, "bmx": 22535, "invested": 22536, "mono": 22537, "lieutenant": 22538, "watkins": 22539, "technician": 22540, "ose": 22541, "kae": 22542, "ìĽ": 22543, "mcqueen": 22544, "preach": 22545, "traveller": 22546, "flexibility": 22547, "zebra": 22548, "retailer": 22549, "pant": 22550, "bender": 22551, "brandt": 22552, "squid": 22553, "warrant": 22554, "verified": 22555, "cass": 22556, "piercing": 22557, "honours": 22558, "tying": 22559, "morris": 22560, "kissed": 22561, "oprah": 22562, "panoramic": 22563, "mei": 22564, "splatoon": 22565, "wichita": 22566, "arias": 22567, "galli": 22568, "indyref": 22569, "goodtimes": 22570, "atheist": 22571, "confession": 22572, "owski": 22573, "repping": 22574, "additions": 22575, "mechanism": 22576, "zim": 22577, "jans": 22578, "suf": 22579, "chopped": 22580, "beginnings": 22581, "vitamins": 22582, "ãħ¤ãħ¤": 22583, "orth": 22584, "poles": 22585, "rub": 22586, "antarctica": 22587, "indiefilm": 22588, "webcam": 22589, "ketch": 22590, "brett": 22591, "clement": 22592, "heron": 22593, "defeating": 22594, "hydro": 22595, "bucket": 22596, "wandering": 22597, "sidney": 22598, "futureof": 22599, "binge": 22600, "onies": 22601, "knockout": 22602, "administrator": 22603, "synthe": 22604, "lent": 22605, "jani": 22606, "barley": 22607, "premierleague": 22608, "nerds": 22609, "crm": 22610, "bras": 22611, "botany": 22612, "evolved": 22613, "rotter": 22614, "rowed": 22615, "tumor": 22616, "wealthy": 22617, "ÂŃ": 22618, "monarch": 22619, "lished": 22620, "dahl": 22621, "ðŁİĥ": 22622, "buch": 22623, "kenyan": 22624, "ا": 22625, "redness": 22626, "assembled": 22627, "semit": 22628, "hudder": 22629, "shrop": 22630, "rani": 22631, "learning": 22632, "mory": 22633, "itia": 22634, "geographic": 22635, "worldof": 22636, "fb": 22637, "phosp": 22638, "boogie": 22639, "amped": 22640, "?...": 22641, "chew": 22642, "dwarf": 22643, "arus": 22644, "ssen": 22645, "rusty": 22646, "recruits": 22647, "hk": 22648, "garde": 22649, "applause": 22650, "volumes": 22651, "involves": 22652, "tac": 22653, "handbag": 22654, "translate": 22655, "ffel": 22656, "seym": 22657, "aquatic": 22658, "transfer": 22659, "zodi": 22660, "andr": 22661, "academia": 22662, "crater": 22663, "tez": 22664, "arse": 22665, "adapt": 22666, "coloni": 22667, "snowman": 22668, "mali": 22669, "hangin": 22670, "dischar": 22671, "oysters": 22672, "phoe": 22673, "colonel": 22674, "wba": 22675, "hispanic": 22676, "thriving": 22677, "shy": 22678, "agles": 22679, "salesforce": 22680, "creme": 22681, "soles": 22682, "lafayette": 22683, "âī": 22684, "teria": 22685, "acha": 22686, "sperson": 22687, "gogo": 22688, "carly": 22689, "theore": 22690, "amore": 22691, "vox": 22692, "aft": 22693, "ãĤ¹": 22694, "staple": 22695, "muffin": 22696, "diagram": 22697, "inox": 22698, "sustained": 22699, "avent": 22700, "meta": 22701, "arbitr": 22702, "decay": 22703, "adole": 22704, "н": 22705, "ecol": 22706, "pho": 22707, "nk": 22708, "ocu": 22709, "granny": 22710, "ça": 22711, "luxembour": 22712, 
"stadt": 22713, "alberto": 22714, "levit": 22715, "amas": 22716, "dx": 22717, "orphan": 22718, "cobb": 22719, "asc": 22720, "logy": 22721, "immense": 22722, "chants": 22723, "offline": 22724, "pent": 22725, "brex": 22726, "winger": 22727, "plane": 22728, "iel": 22729, "nichols": 22730, "cathy": 22731, "naruto": 22732, "lowed": 22733, "///": 22734, "ignorance": 22735, "catastro": 22736, "youts": 22737, "schen": 22738, "build": 22739, "hazi": 22740, "sine": 22741, "criticalrole": 22742, "dug": 22743, "detect": 22744, "logs": 22745, "enamel": 22746, "stpatricksday": 22747, "eddie": 22748, "copa": 22749, "cigarettes": 22750, "hoff": 22751, "kaya": 22752, "lagoon": 22753, "rapha": 22754, "airborne": 22755, "choose": 22756, "puertor": 22757, "kev": 22758, "guiding": 22759, "frosty": 22760, "borough": 22761, "mira": 22762, "ðŁİĬ": 22763, "cadet": 22764, "anush": 22765, "yogi": 22766, "eger": 22767, "fling": 22768, "slope": 22769, "ninth": 22770, "weston": 22771, "footwear": 22772, "fn": 22773, "mayweather": 22774, "aam": 22775, "plain": 22776, "staircase": 22777, "witnesses": 22778, "workouts": 22779, "robust": 22780, "dexter": 22781, "cohort": 22782, "ðŁļĹ": 22783, "spell": 22784, "haze": 22785, "oom": 22786, "organising": 22787, "wildfire": 22788, "contacts": 22789, "avon": 22790, "mino": 22791, "updating": 22792, "ðŁį»": 22793, "lithium": 22794, "ingual": 22795, "kis": 22796, "auga": 22797, "locom": 22798, "deduc": 22799, "uda": 22800, "thak": 22801, "boyle": 22802, "mper": 22803, "hottie": 22804, "erik": 22805, "revised": 22806, "isla": 22807, "travelphotography": 22808, "ooza": 22809, "enqui": 22810, "conferences": 22811, "clover": 22812, "groom": 22813, "curves": 22814, "liveon": 22815, "perf": 22816, "displaced": 22817, "bolog": 22818, "xxxx": 22819, "ðŁĺ©ðŁĺ©": 22820, "teal": 22821, "vessels": 22822, "rainforest": 22823, "calci": 22824, "panther": 22825, "giraffe": 22826, "tasted": 22827, "imagery": 22828, "padres": 22829, "daytime": 22830, "bass": 22831, "ripe": 22832, "opioid": 22833, "nue": 22834, "vinyl": 22835, "inventor": 22836, "sens": 22837, "processor": 22838, "mut": 22839, "gadgets": 22840, "biblical": 22841, "shannon": 22842, "jacqueline": 22843, "cary": 22844, "theresistance": 22845, "alien": 22846, "nvi": 22847, "cosy": 22848, "bihar": 22849, "foley": 22850, "rend": 22851, "mugs": 22852, "faken": 22853, "clone": 22854, "niallo": 22855, "grabbed": 22856, "chihu": 22857, "powerhouse": 22858, "ntt": 22859, "cherokee": 22860, "sponge": 22861, "implementing": 22862, "rhine": 22863, "leone": 22864, "ðŁįĢ": 22865, "prettiest": 22866, "infrared": 22867, "improv": 22868, "switched": 22869, "tubes": 22870, "contr": 22871, "blk": 22872, "projected": 22873, "beaver": 22874, "yot": 22875, "bbcradio": 22876, "thigh": 22877, "persecu": 22878, "apologize": 22879, "wack": 22880, "poster": 22881, "oliver": 22882, "aza": 22883, "loud": 22884, "(?)": 22885, "fthe": 22886, "womenshi": 22887, "sparrow": 22888, "blush": 22889, "usable": 22890, "scales": 22891, "itative": 22892, "peuge": 22893, "needing": 22894, "leggings": 22895, "glamorous": 22896, "matur": 22897, "cz": 22898, "watt": 22899, "dab": 22900, "tamar": 22901, "etsym": 22902, "bauer": 22903, "heartfelt": 22904, "hn": 22905, "elsewhere": 22906, "birch": 22907, "alumini": 22908, "huck": 22909, "eme": 22910, "jl": 22911, "trafford": 22912, "dz": 22913, "portions": 22914, "anasta": 22915, "arthritis": 22916, "espn": 22917, "bergen": 22918, "violation": 22919, "yoshi": 22920, "cz": 22921, "northumberland": 22922, "closures": 22923, 
"ðŁĩ¯ðŁĩ": 22924, "smiley": 22925, "rw": 22926, "telugu": 22927, "intensi": 22928, "gregg": 22929, "vega": 22930, "dungeon": 22931, "southbound": 22932, "bail": 22933, "dominican": 22934, "semifinal": 22935, "chapters": 22936, "hitch": 22937, "vanity": 22938, "transiti": 22939, "recommends": 22940, "satisf": 22941, "barca": 22942, "queens": 22943, "((": 22944, "destruc": 22945, "strait": 22946, "ravi": 22947, "desserts": 22948, "intru": 22949, "haram": 22950, "kos": 22951, "foe": 22952, "fatty": 22953, "paisley": 22954, "magnitude": 22955, "dridge": 22956, "comey": 22957, "schemes": 22958, "visionary": 22959, "ourt": 22960, "downloaded": 22961, "ðŁĻĮðŁı½": 22962, "gdpr": 22963, "lani": 22964, "pwc": 22965, "guad": 22966, "nicest": 22967, "stakeholders": 22968, "referred": 22969, "georgetown": 22970, "arvindkejriwal": 22971, "schneider": 22972, "indoors": 22973, "allstar": 22974, "stranded": 22975, "gender": 22976, "zepp": 22977, "masses": 22978, "ðŁIJ±": 22979, "patiently": 22980, "bldg": 22981, "zab": 22982, "wearab": 22983, "vivid": 22984, "heck": 22985, "della": 22986, "symb": 22987, "jeopar": 22988, "lager": 22989, "àª": 22990, "combines": 22991, "nec": 22992, "bray": 22993, "flop": 22994, "txwx": 22995, "joys": 22996, "pont": 22997, "profound": 22998, "surround": 22999, "madhu": 23000, "mable": 23001, "ayr": 23002, "teas": 23003, "nsa": 23004, "openly": 23005, "ernest": 23006, "ãĥ©": 23007, "topo": 23008, "gna": 23009, "antioxid": 23010, "tian": 23011, "etr": 23012, "cello": 23013, "mathi": 23014, "generosity": 23015, "biting": 23016, "manic": 23017, "kelsey": 23018, "cheeks": 23019, "tender": 23020, "wth": 23021, "pronoun": 23022, "ultimately": 23023, "gusta": 23024, "arianag": 23025, "gerry": 23026, "bleed": 23027, "reddy": 23028, "mich": 23029, "mitsubishi": 23030, "operated": 23031, "sexually": 23032, "mau": 23033, "cllr": 23034, "vids": 23035, "coc": 23036, "melted": 23037, "ðŁĮĪ": 23038, "qld": 23039, "itech": 23040, "instrumental": 23041, "endgame": 23042, "ðŁĵĸ": 23043, "energi": 23044, "brownie": 23045, "tamil": 23046, "atin": 23047, "dominated": 23048, "praises": 23049, "fireplace": 23050, "sensational": 23051, "mena": 23052, "karti": 23053, "unprece": 23054, "rupt": 23055, "oriental": 23056, "mccor": 23057, "tournaments": 23058, "scenter": 23059, "reeves": 23060, "prescription": 23061, "same": 23062, "frau": 23063, "truffle": 23064, "embo": 23065, "romans": 23066, "blasts": 23067, "technological": 23068, "prat": 23069, "bsb": 23070, "yar": 23071, "trendy": 23072, "acl": 23073, "alad": 23074, "ðŁįģ": 23075, "ohh": 23076, "bankrupt": 23077, "thoven": 23078, "regards": 23079, "iser": 23080, "warwick": 23081, "vineyards": 23082, "realm": 23083, "niallofficial": 23084, "dota": 23085, "gemini": 23086, "todo": 23087, "vable": 23088, "¨¨": 23089, "lau": 23090, "wreath": 23091, "juve": 23092, "natasha": 23093, "lever": 23094, "lori": 23095, "horser": 23096, "cctv": 23097, "airbnb": 23098, "esanders": 23099, "sinclair": 23100, "emabiggest": 23101, "highschool": 23102, "contest": 23103, "optimistic": 23104, "tte": 23105, "ðŁĴķðŁĴķ": 23106, "ssd": 23107, "yee": 23108, "helena": 23109, "consen": 23110, "ricks": 23111, "jesse": 23112, "anic": 23113, "ðŁİ¯": 23114, "reacts": 23115, "robe": 23116, "independence": 23117, "voltage": 23118, "mington": 23119, "sant": 23120, "à¸Ļà¸": 23121, "----------------": 23122, "sentinel": 23123, "kett": 23124, "rehearsing": 23125, "aaaaaaaa": 23126, "softhe": 23127, "stirling": 23128, "search": 23129, "wigan": 23130, "standout": 23131, "snail": 23132, 
"pentagon": 23133, "Äģ": 23134, "chlor": 23135, "crust": 23136, "netany": 23137, "chemist": 23138, "disappeared": 23139, "ricardo": 23140, "spiders": 23141, "bose": 23142, "warren": 23143, "messing": 23144, "banners": 23145, "guel": 23146, "parach": 23147, "maid": 23148, "counted": 23149, "epile": 23150, "bonfire": 23151, "speechless": 23152, "setter": 23153, "measured": 23154, "rejects": 23155, "nikki": 23156, "lester": 23157, "forensic": 23158, "fabrics": 23159, "aloha": 23160, "preserved": 23161, "watford": 23162, "detailing": 23163, "darth": 23164, "bou": 23165, "carly": 23166, "...'": 23167, "tailgate": 23168, "notifications": 23169, "å¤": 23170, "passive": 23171, "trousers": 23172, "baloch": 23173, "rother": 23174, "typically": 23175, "Ã¥": 23176, "spit": 23177, "wiz": 23178, "sicily": 23179, "technically": 23180, "expose": 23181, "stage": 23182, "hubb": 23183, "cream": 23184, "caps": 23185, "poke": 23186, "sleek": 23187, "june": 23188, "temporarily": 23189, "dez": 23190, "awakens": 23191, "lame": 23192, "_-": 23193, "jiha": 23194, "tuesdays": 23195, "advised": 23196, "advisors": 23197, "existed": 23198, "disagree": 23199, "newsroom": 23200, "losers": 23201, "worldtour": 23202, "drying": 23203, "aldi": 23204, "harness": 23205, "footprint": 23206, "hobbit": 23207, "pmln": 23208, "iro": 23209, "quered": 23210, "assess": 23211, "gaze": 23212, "sab": 23213, "thian": 23214, "íĬ": 23215, "tif": 23216, "observe": 23217, "evil": 23218, "drawer": 23219, "sweep": 23220, "cory": 23221, "cody": 23222, "kyoto": 23223, "callum": 23224, "ninj": 23225, "laurent": 23226, "bei": 23227, "sketching": 23228, "customized": 23229, "dur": 23230, "regrets": 23231, "knoxville": 23232, "ìķĦ": 23233, "messaging": 23234, "gracie": 23235, "abundance": 23236, "bidding": 23237, "brewed": 23238, "flouri": 23239, "therapeutic": 23240, "altitude": 23241, "hogs": 23242, "burner": 23243, "electro": 23244, "wonderfully": 23245, "heater": 23246, "postpon": 23247, "livery": 23248, "rall": 23249, "adas": 23250, "aac": 23251, "saul": 23252, "brooklyn": 23253, "playhouse": 23254, "âĻ¥âĻ¥âĻ¥": 23255, "charitable": 23256, "iny": 23257, "zah": 23258, "competitions": 23259, "beav": 23260, "plugged": 23261, "ois": 23262, "doom": 23263, "astronom": 23264, "specialized": 23265, "maxi": 23266, "taps": 23267, "cellular": 23268, "depressed": 23269, "folklorethursday": 23270, "crib": 23271, "emul": 23272, "ë°©": 23273, "figh": 23274, "ruz": 23275, "carlisle": 23276, "spear": 23277, "sidewalk": 23278, "dei": 23279, "dependent": 23280, "laces": 23281, "nhs": 23282, "ðŁĮĻ": 23283, "realizing": 23284, "network": 23285, "riche": 23286, "regin": 23287, "refresh": 23288, "stral": 23289, "pathology": 23290, "plaid": 23291, "psychedelic": 23292, "hind": 23293, "uka": 23294, "algorithm": 23295, "linking": 23296, "progressi": 23297, "fey": 23298, "dade": 23299, "hydrated": 23300, "bant": 23301, "famed": 23302, "cotsw": 23303, "boise": 23304, "asc": 23305, "racing": 23306, "javier": 23307, "wwen": 23308, "marlins": 23309, "poop": 23310, "swept": 23311, "tonights": 23312, "wef": 23313, "anime": 23314, "slovak": 23315, "âŀĸâŀĸ": 23316, "claus": 23317, "lemme": 23318, "clippers": 23319, "rels": 23320, "arianagrande": 23321, "rte": 23322, "kot": 23323, "thalapathy": 23324, "hungarian": 23325, "zuma": 23326, "yvon": 23327, "isu": 23328, "journeys": 23329, "clinics": 23330, "bebe": 23331, "wwf": 23332, "nws": 23333, "superheroes": 23334, "erit": 23335, "sleague": 23336, "identification": 23337, "motto": 23338, "bai": 23339, "sourced": 23340, "iller": 
23341, "api": 23342, "prise": 23343, "unprecedented": 23344, "damas": 23345, "tunisia": 23346, "drain": 23347, "underestim": 23348, "ether": 23349, "quarterly": 23350, "rewarding": 23351, "alham": 23352, "wolverine": 23353, "cabine": 23354, "hypno": 23355, "nadine": 23356, "havana": 23357, "dae": 23358, "ðŁĵĪ": 23359, "dron": 23360, "readings": 23361, "bati": 23362, "pico": 23363, "merci": 23364, "itian": 23365, "walkers": 23366, "elope": 23367, "mikey": 23368, "godzilla": 23369, "burlington": 23370, "abuja": 23371, "socialism": 23372, "atility": 23373, "shell": 23374, "harrypotter": 23375, "gno": 23376, "abur": 23377, "releg": 23378, "felici": 23379, "rogen": 23380, "neuroscience": 23381, "instin": 23382, "atham": 23383, "vouchers": 23384, "jarre": 23385, "fuse": 23386, "defici": 23387, "monterey": 23388, "deport": 23389, "midday": 23390, "ppard": 23391, "freed": 23392, "ameter": 23393, "wilt": 23394, "ningham": 23395, "pratt": 23396, "liberty": 23397, "slogan": 23398, "oto": 23399, "pri": 23400, "coated": 23401, "cpd": 23402, "nett": 23403, "illas": 23404, "malawi": 23405, "evolve": 23406, "accessibility": 23407, "ðŁĶ¥ðŁĶ¥ðŁĶ¥ðŁĶ¥": 23408, "ornament": 23409, "bp": 23410, "elis": 23411, "sonline": 23412, "chiro": 23413, "flick": 23414, "ibm": 23415, "arak": 23416, "enables": 23417, "garland": 23418, "sane": 23419, "cuties": 23420, "trip": 23421, "rotterdam": 23422, "nys": 23423, "lamps": 23424, "lucas": 23425, "bog": 23426, "rails": 23427, "travelled": 23428, "hicks": 23429, "enu": 23430, "sabha": 23431, "scrub": 23432, "hier": 23433, "hartford": 23434, "foo": 23435, "fernandez": 23436, "trevor": 23437, "mattress": 23438, "appointments": 23439, "alej": 23440, "fei": 23441, "ologist": 23442, "safar": 23443, "octa": 23444, "src": 23445, "shaun": 23446, "ambient": 23447, "dric": 23448, "biker": 23449, "shee": 23450, "mustache": 23451, "hta": 23452, "boone": 23453, "herty": 23454, "cardio": 23455, "brakes": 23456, "recital": 23457, "consists": 23458, "overwhelmed": 23459, "caul": 23460, "robbins": 23461, "imit": 23462, "alth": 23463, "url": 23464, "bibli": 23465, "onne": 23466, "blacklivesmatter": 23467, "difficulties": 23468, "telang": 23469, "taller": 23470, "ðŁĵĨ": 23471, "debating": 23472, "burrito": 23473, "movember": 23474, "strengthening": 23475, "boe": 23476, "testam": 23477, "miracles": 23478, "baseball": 23479, "renee": 23480, "ðŁijīðŁı»": 23481, "alfa": 23482, "âĺĺ": 23483, "unstoppable": 23484, "ecs": 23485, "gmo": 23486, "giftideas": 23487, "pathway": 23488, "fencing": 23489, "ðŁİ¤": 23490, "bham": 23491, "ras": 23492, "sko": 23493, "dled": 23494, "thelast": 23495, "magnum": 23496, "binary": 23497, "wilde": 23498, "wilder": 23499, "whati": 23500, "barbecue": 23501, "hism": 23502, "canoe": 23503, "kurdi": 23504, "elive": 23505, "advantages": 23506, "madame": 23507, "bier": 23508, "missing": 23509, "entertain": 23510, "airforce": 23511, "yama": 23512, "cis": 23513, "hashtags": 23514, "jis": 23515, "veil": 23516, "dreamy": 23517, "tense": 23518, "mayward": 23519, "chateau": 23520, "huntington": 23521, "âļĵ": 23522, "vall": 23523, "upon": 23524, "blouse": 23525, "dunes": 23526, "ðŁĺ´": 23527, "fertility": 23528, "mole": 23529, "currencies": 23530, "stu": 23531, "berlin": 23532, "toasted": 23533, "divas": 23534, "walt": 23535, "lark": 23536, "pora": 23537, "hitter": 23538, "umer": 23539, "chilled": 23540, "balancing": 23541, "fais": 23542, "yin": 23543, "ortiz": 23544, "eastenders": 23545, "hate": 23546, "ural": 23547, "april": 23548, "timel": 23549, "à±": 23550, "pero": 23551, 
"stocked": 23552, "respects": 23553, "tht": 23554, "bestfriends": 23555, "givingtuesday": 23556, "bead": 23557, "invent": 23558, "imi": 23559, "naples": 23560, "combining": 23561, "tokens": 23562, "thirst": 23563, "masc": 23564, "parrot": 23565, "spu": 23566, "denton": 23567, "*-*": 23568, "tres": 23569, "suburban": 23570, "width": 23571, "sive": 23572, "contender": 23573, "sirius": 23574, "lok": 23575, "troopers": 23576, "outrage": 23577, "turbo": 23578, "fragile": 23579, "messed": 23580, "doh": 23581, "discord": 23582, "netanyahu": 23583, "resign": 23584, "forgiveness": 23585, "mohan": 23586, "munch": 23587, "camou": 23588, "identifying": 23589, "enabling": 23590, "hotter": 23591, "thornton": 23592, "jaipur": 23593, "arya": 23594, "ðŁı»âĢįâĻĢï¸ı": 23595, "mustaf": 23596, "majors": 23597, "oke": 23598, "duffy": 23599, "rohing": 23600, "tilt": 23601, "ðŁĩ®ðŁĩ³": 23602, "rockstar": 23603, "sheep": 23604, "hendrix": 23605, "rav": 23606, "invention": 23607, "dou": 23608, "laguna": 23609, "grumpy": 23610, "swis": 23611, "impe": 23612, ")'": 23613, "youths": 23614, "bunker": 23615, "stache": 23616, "oppose": 23617, "indies": 23618, "accelerate": 23619, "mlp": 23620, "eden": 23621, "wann": 23622, "kail": 23623, "akshaykumar": 23624, "supt": 23625, "polym": 23626, "middleton": 23627, "extraordin": 23628, "wilson": 23629, "australian": 23630, "aluminium": 23631, "wayne": 23632, "alumnus": 23633, "matics": 23634, "grim": 23635, "ernie": 23636, "oppa": 23637, "competitors": 23638, "randall": 23639, "hence": 23640, "declares": 23641, "preaching": 23642, "shahe": 23643, "cane": 23644, "sustainable": 23645, "staples": 23646, "ledge": 23647, "adena": 23648, "doctoral": 23649, "burgundy": 23650, "decorate": 23651, "rendered": 23652, "risen": 23653, "prank": 23654, "dior": 23655, "beethoven": 23656, "floor": 23657, "accom": 23658, "tot": 23659, "hodg": 23660, "tourism": 23661, "sayin": 23662, "objective": 23663, "markers": 23664, "premiership": 23665, "enabled": 23666, "camoufla": 23667, "giant": 23668, "Ñģ": 23669, "smokey": 23670, "ricket": 23671, "pang": 23672, "depending": 23673, "sation": 23674, "evolving": 23675, "intercep": 23676, "census": 23677, "tofthe": 23678, "reen": 23679, "mendoza": 23680, "trumpet": 23681, "marketers": 23682, "anit": 23683, "ðŁĻĬ": 23684, "northwestern": 23685, "vla": 23686, "fotogra": 23687, "blackandwhite": 23688, "chewan": 23689, "wig": 23690, "troom": 23691, "gingerbread": 23692, "kn": 23693, "romero": 23694, "nfc": 23695, "orchi": 23696, "funko": 23697, "source": 23698, "fs": 23699, "raped": 23700, "ost": 23701, "tarot": 23702, "annually": 23703, "ðŁĺ¬": 23704, "rill": 23705, "delav": 23706, "..!!": 23707, "ses": 23708, "cann": 23709, "medicare": 23710, "phel": 23711, "apex": 23712, "guardian": 23713, "remained": 23714, "rpm": 23715, "añ": 23716, "storymonth": 23717, "instagood": 23718, "neighbour": 23719, "ping": 23720, "semite": 23721, "mystic": 23722, "ascot": 23723, "mater": 23724, "handful": 23725, "dangers": 23726, "tid": 23727, "anaheim": 23728, "opoly": 23729, "shallow": 23730, "namibia": 23731, "toria": 23732, "procurement": 23733, "bigbang": 23734, "announcements": 23735, "prosecutor": 23736, "bengals": 23737, "salle": 23738, "enroll": 23739, "gastro": 23740, "suggestion": 23741, "bak": 23742, "haul": 23743, "buddhism": 23744, "berniesanders": 23745, "flute": 23746, "fatigue": 23747, "cynthia": 23748, "choi": 23749, "irwin": 23750, "gua": 23751, "strous": 23752, "hp": 23753, "bap": 23754, "satisfying": 23755, "playa": 23756, "ðŁİ¼": 23757, "instap": 23758, 
"alice": 23759, "tp": 23760, "irrigation": 23761, "ðŁĩ¬ðŁĩ§": 23762, "intric": 23763, "clues": 23764, "plex": 23765, "sax": 23766, "hepat": 23767, "dumped": 23768, "significance": 23769, "byu": 23770, "medication": 23771, "prov": 23772, "toughest": 23773, "cornish": 23774, "âŀľ": 23775, "kelley": 23776, "uv": 23777, "sizz": 23778, "sibling": 23779, "mest": 23780, "distor": 23781, "diplomatic": 23782, "auntie": 23783, "bhat": 23784, "sonic": 23785, "brenda": 23786, "pumpkins": 23787, "roch": 23788, "blackburn": 23789, "urged": 23790, "shia": 23791, "arrangements": 23792, "flood": 23793, "saunders": 23794, "lecturer": 23795, "nouri": 23796, "populations": 23797, "diplomacy": 23798, "consistently": 23799, "ðŁ¤Ļ": 23800, "tmund": 23801, "cauliflower": 23802, "lily": 23803, "vocabulary": 23804, "varieties": 23805, "cooker": 23806, "uptown": 23807, "quent": 23808, "mosa": 23809, "reinde": 23810, "velocity": 23811, "spruce": 23812, "socialmedi": 23813, "iber": 23814, "voluntary": 23815, "processed": 23816, "baltic": 23817, "yang": 23818, "lebanese": 23819, "dp": 23820, "dolly": 23821, "arrangement": 23822, "yuri": 23823, "cranberry": 23824, "kalyan": 23825, "elevation": 23826, "cliff": 23827, "pushes": 23828, "ìĬ¤": 23829, "silic": 23830, "cowx": 23831, "eternity": 23832, "slaves": 23833, "vinegar": 23834, "gloucester": 23835, "contained": 23836, "breakingnews": 23837, "against": 23838, "renovated": 23839, "normandy": 23840, "heroin": 23841, "ysm": 23842, "mods": 23843, "greek": 23844, "undi": 23845, "trench": 23846, "vh": 23847, "encourages": 23848, "headache": 23849, "grange": 23850, ":'": 23851, "evergreen": 23852, "ÙĬ": 23853, "reckon": 23854, "abused": 23855, "thru": 23856, "choice": 23857, "tidy": 23858, "colder": 23859, "schoice": 23860, "hain": 23861, "brum": 23862, "liars": 23863, "breit": 23864, "yorker": 23865, "shack": 23866, "heidi": 23867, "michaels": 23868, "scopic": 23869, "fascist": 23870, "playful": 23871, "cac": 23872, "yasss": 23873, "shad": 23874, "..?": 23875, "quen": 23876, "ramirez": 23877, "clifton": 23878, "prs": 23879, "bestfan": 23880, "âģł": 23881, "generating": 23882, "headset": 23883, "disappointment": 23884, "abstract": 23885, "boiled": 23886, "parenthood": 23887, "azerbaijan": 23888, "exhibiting": 23889, "bombay": 23890, "olivier": 23891, "koso": 23892, "unlea": 23893, "maternity": 23894, "izer": 23895, "sives": 23896, "rhu": 23897, "coll": 23898, "saskatchewan": 23899, "freakin": 23900, "dek": 23901, "nag": 23902, "stabili": 23903, "ðŁįķ": 23904, "organizer": 23905, "bosses": 23906, "aru": 23907, "uva": 23908, "atable": 23909, "taun": 23910, "afterwards": 23911, "fertili": 23912, "verge": 23913, "azi": 23914, "morph": 23915, "à¹ģà¸": 23916, "jerk": 23917, "cosmetic": 23918, "kow": 23919, "strust": 23920, "apache": 23921, "postcards": 23922, "formul": 23923, "ìĭ": 23924, "spinal": 23925, "jackpot": 23926, "electri": 23927, "ÃŃ": 23928, "loy": 23929, "grader": 23930, "diablo": 23931, "ardi": 23932, "hesit": 23933, "fw": 23934, "archery": 23935, "pash": 23936, "theories": 23937, "repeal": 23938, "relive": 23939, "percy": 23940, "âĺĨ": 23941, "imin": 23942, "synchron": 23943, "shampoo": 23944, "coupons": 23945, "oto": 23946, "lai": 23947, "thought": 23948, "luxembourg": 23949, "mov": 23950, "ðŁĺ¥": 23951, "gemma": 23952, "seated": 23953, "mga": 23954, "stratford": 23955, "uncertainty": 23956, "shifts": 23957, "esto": 23958, "fool": 23959, "firearms": 23960, "corrie": 23961, "kiki": 23962, "apparent": 23963, "pills": 23964, "olympia": 23965, "fid": 23966, "elevated": 
23967, "decks": 23968, "ignoring": 23969, "avalan": 23970, "rov": 23971, "whistle": 23972, "ptsd": 23973, "militants": 23974, "robotic": 23975, "pacers": 23976, "quilt": 23977, "bankruptcy": 23978, "lich": 23979, "percussion": 23980, "celebrity": 23981, "als": 23982, "(;": 23983, "sut": 23984, "pokemongo": 23985, "hg": 23986, "offs": 23987, "gibraltar": 23988, "screams": 23989, "billie": 23990, "genome": 23991, "marin": 23992, "beams": 23993, "archbishop": 23994, "emin": 23995, "bedrooms": 23996, "gated": 23997, "olly": 23998, "warranty": 23999, "atown": 24000, "cuddles": 24001, "gunna": 24002, "kic": 24003, "vive": 24004, "cymru": 24005, "narrow": 24006, "prob": 24007, "leo": 24008, "references": 24009, "manufactured": 24010, "chopper": 24011, "brunswick": 24012, "semis": 24013, "donia": 24014, "rye": 24015, "mano": 24016, "hurting": 24017, "?#": 24018, "holli": 24019, "investigations": 24020, "cels": 24021, "ðŁĵŀ": 24022, "lester": 24023, "temples": 24024, "storey": 24025, "mcmahon": 24026, "toilets": 24027, "woof": 24028, "ï¸İ": 24029, "leverage": 24030, "atom": 24031, "nightmares": 24032, "victorious": 24033, "haunting": 24034, "customer": 24035, "agi": 24036, "yoongi": 24037, "monty": 24038, "veronica": 24039, "wur": 24040, "intimid": 24041, "blankets": 24042, "volution": 24043, "jm": 24044, "âĺİ": 24045, "amon": 24046, "judith": 24047, "ðŁĺİðŁĺİ": 24048, "distracted": 24049, "drip": 24050, "hurricane": 24051, "andes": 24052, "revelation": 24053, "troop": 24054, "ableg": 24055, "collin": 24056, "tibetan": 24057, "worrying": 24058, "internationally": 24059, "eater": 24060, "cameroon": 24061, "brador": 24062, "yuk": 24063, "ðŁĴĹðŁĴĹ": 24064, "trak": 24065, "slopes": 24066, "cier": 24067, "nea": 24068, "oler": 24069, "taka": 24070, "albion": 24071, "volcanic": 24072, "amn": 24073, "afi": 24074, "obstac": 24075, "facetime": 24076, "gering": 24077, "npr": 24078, "metallica": 24079, "organic": 24080, "ðŁĴ¡": 24081, "kidd": 24082, "dances": 24083, "pembro": 24084, "washer": 24085, "mits": 24086, "omer": 24087, "emotionally": 24088, "tango": 24089, "ipo": 24090, "docks": 24091, "scanning": 24092, "specs": 24093, "thom": 24094, "theology": 24095, "emergen": 24096, "omi": 24097, "gpa": 24098, "selections": 24099, "unnecessary": 24100, "image": 24101, "ters": 24102, "induced": 24103, "gigan": 24104, "rentals": 24105, "supplied": 24106, "mfa": 24107, "shankar": 24108, "later": 24109, "pajam": 24110, "clave": 24111, "Ùģ": 24112, "mahin": 24113, "carlson": 24114, "avian": 24115, "anova": 24116, "katie": 24117, "ajith": 24118, "designated": 24119, "chocolates": 24120, "investigators": 24121, "glazed": 24122, "princess": 24123, "erry": 24124, "ragn": 24125, "ourable": 24126, "hru": 24127, "sundance": 24128, "peugeot": 24129, "steampunk": 24130, "ghlin": 24131, "grease": 24132, "hires": 24133, "zap": 24134, "perce": 24135, "jill": 24136, "tome": 24137, "hehehe": 24138, "joyful": 24139, "maestro": 24140, "nished": 24141, "genealo": 24142, "vich": 24143, "pits": 24144, "foxes": 24145, "goodman": 24146, "emerson": 24147, "lobes": 24148, "converse": 24149, "oats": 24150, "thomson": 24151, "rahim": 24152, "malware": 24153, "ahi": 24154, "mankind": 24155, "resin": 24156, "img": 24157, "swood": 24158, "kinder": 24159, "scroll": 24160, "ara": 24161, "sakura": 24162, "robbed": 24163, "xion": 24164, "nya": 24165, "cism": 24166, "cedar": 24167, "bein": 24168, "mourning": 24169, "torto": 24170, "heathrow": 24171, "donegal": 24172, "barb": 24173, "hydration": 24174, "kor": 24175, "elimination": 24176, "supdates": 
24177, "hills": 24178, "appeti": 24179, "starred": 24180, "kom": 24181, "gwen": 24182, "ddd": 24183, "cray": 24184, "scanner": 24185, "personalised": 24186, "serenity": 24187, "redesign": 24188, "metaph": 24189, "boxed": 24190, "judgment": 24191, "nose": 24192, "ë¹": 24193, "erad": 24194, "acne": 24195, "suppliers": 24196, "energetic": 24197, "vom": 24198, "asap": 24199, "ðŁĶ¸": 24200, "irvine": 24201, "hatch": 24202, "lass": 24203, "adren": 24204, "waffles": 24205, "accurately": 24206, "icio": 24207, "ittle": 24208, "seun": 24209, "occupy": 24210, "webcam": 24211, "thenew": 24212, "entes": 24213, "gai": 24214, "jw": 24215, "accountable": 24216, "visor": 24217, "irrit": 24218, "licensing": 24219, "huddersfield": 24220, "genie": 24221, "ðŁİ¾": 24222, "atmospheric": 24223, "tensions": 24224, "spartan": 24225, "clifford": 24226, "olan": 24227, "northbound": 24228, "ameen": 24229, "censor": 24230, "uel": 24231, "stery": 24232, "$$": 24233, "farrell": 24234, "hyster": 24235, "clt": 24236, "sedan": 24237, "replied": 24238, "describing": 24239, "microwave": 24240, "slab": 24241, "prosp": 24242, "assisting": 24243, "rubio": 24244, "ethan": 24245, "hhhhh": 24246, "guay": 24247, "zman": 24248, "raise": 24249, "rolling": 24250, "oe": 24251, "nile": 24252, "ambrose": 24253, "scarborough": 24254, "heroic": 24255, "cooks": 24256, "mort": 24257, "chopra": 24258, "ðŁĮ·": 24259, "tob": 24260, "shaving": 24261, "stacey": 24262, "dorm": 24263, "motorsports": 24264, "wiki": 24265, "folds": 24266, "spiced": 24267, "stressful": 24268, "literal": 24269, "fudge": 24270, "peggy": 24271, "waite": 24272, "tresses": 24273, "sesh": 24274, "pric": 24275, "ðŁİħ": 24276, "fright": 24277, "rva": 24278, "mumbai": 24279, "pom": 24280, "ttv": 24281, "cellar": 24282, "tome": 24283, "android": 24284, "doris": 24285, "tsunami": 24286, "tinder": 24287, "oec": 24288, "mwc": 24289, "dortmund": 24290, "nothin": 24291, "liti": 24292, "sou": 24293, "believein": 24294, "atu": 24295, "knocks": 24296, "magni": 24297, "sssss": 24298, "rohit": 24299, "inews": 24300, "angi": 24301, "mandy": 24302, "kettle": 24303, "intermediate": 24304, "avant": 24305, "curl": 24306, "endorsed": 24307, "orio": 24308, "urt": 24309, "consideration": 24310, "wires": 24311, "shelters": 24312, "bino": 24313, "vikram": 24314, "implemented": 24315, "lydia": 24316, "buk": 24317, "parody": 24318, "cnews": 24319, "undergraduate": 24320, "canucks": 24321, "sami": 24322, "politically": 24323, "rotten": 24324, "ghz": 24325, "textiles": 24326, "overload": 24327, "moderni": 24328, "recreational": 24329, "flir": 24330, "baton": 24331, "typography": 24332, "ovation": 24333, "intriguing": 24334, "pilgrimage": 24335, "alge": 24336, "adays": 24337, "tcmparty": 24338, "spelled": 24339, "curls": 24340, "booze": 24341, "stem": 24342, "annes": 24343, "irls": 24344, "sponge": 24345, "shopper": 24346, "signation": 24347, "brass": 24348, "mistress": 24349, "leah": 24350, "beginner": 24351, "lauderdale": 24352, "august": 24353, "preschool": 24354, "taping": 24355, "taipei": 24356, "executives": 24357, "bd": 24358, "rhetor": 24359, "escor": 24360, "immuno": 24361, "deeplearning": 24362, "statues": 24363, "itus": 24364, "manuscript": 24365, "lyric": 24366, "corvette": 24367, "molly": 24368, "lage": 24369, "dep": 24370, "cnbc": 24371, "lest": 24372, "jessi": 24373, "fife": 24374, "griffith": 24375, "opposing": 24376, "rang": 24377, "drills": 24378, "respectful": 24379, "pity": 24380, "dell": 24381, "harding": 24382, "playboy": 24383, "bloke": 24384, "shutout": 24385, "kili": 24386, 
"osp": 24387, "seattle": 24388, "bcpoli": 24389, "mises": 24390, "journals": 24391, "teaming": 24392, "esther": 24393, "freddy": 24394, "Ķï¸ı": 24395, "metrics": 24396, "notre": 24397, "garry": 24398, "forty": 24399, "navigate": 24400, "periods": 24401, "benedic": 24402, "jid": 24403, "daw": 24404, "ancestors": 24405, "restoring": 24406, "cong": 24407, "allergy": 24408, "titanium": 24409, "cence": 24410, "leaning": 24411, "abbas": 24412, "vast": 24413, "ucf": 24414, "roofing": 24415, "eman": 24416, "severely": 24417, "vogue": 24418, "veau": 24419, "inbound": 24420, "dz": 24421, "taneously": 24422, "stretching": 24423, "manchester": 24424, "dryer": 24425, "davis": 24426, "kanth": 24427, "thegame": 24428, "itted": 24429, "retain": 24430, "elles": 24431, "congestion": 24432, "fraternity": 24433, "ollie": 24434, "loki": 24435, "freely": 24436, "choo": 24437, "pony": 24438, "scep": 24439, "tably": 24440, "balt": 24441, "rockn": 24442, "dime": 24443, "logging": 24444, "ðŁį·": 24445, "adu": 24446, "havoc": 24447, "waterford": 24448, "charis": 24449, "sweetie": 24450, "running": 24451, "nerd": 24452, "erdogan": 24453, "zara": 24454, "weighing": 24455, "fifty": 24456, "precise": 24457, "lowell": 24458, "kurdistan": 24459, "ryo": 24460, "orth": 24461, "synth": 24462, "liners": 24463, "phenomenon": 24464, "artillery": 24465, "illegally": 24466, "construct": 24467, "nostalgic": 24468, "garth": 24469, "alta": 24470, "shelton": 24471, "asean": 24472, "wander": 24473, "durban": 24474, "diversi": 24475, "bono": 24476, "clon": 24477, "leman": 24478, "shun": 24479, "obstacles": 24480, "appetite": 24481, "feeder": 24482, "respiratory": 24483, "dixie": 24484, "formula": 24485, "anto": 24486, "sober": 24487, "extinct": 24488, "auc": 24489, "ingles": 24490, "legitimate": 24491, ";;": 24492, "minnie": 24493, "ipswich": 24494, "dramatically": 24495, "ðŁijıðŁı¼": 24496, "ingham": 24497, "military": 24498, "monet": 24499, "usnavy": 24500, "fork": 24501, "dunno": 24502, "player": 24503, "qotd": 24504, "stoo": 24505, "exor": 24506, "ethiopian": 24507, "filmfest": 24508, "pered": 24509, "cate": 24510, "saudi": 24511, "inner": 24512, "sincere": 24513, "tionality": 24514, "alee": 24515, "deeds": 24516, "cooperative": 24517, "ironic": 24518, "crocod": 24519, "brary": 24520, "postseason": 24521, "camper": 24522, "canary": 24523, "ein": 24524, "extensions": 24525, "nbd": 24526, "sherwood": 24527, "spokane": 24528, "hump": 24529, "jitsu": 24530, "ê¹": 24531, "daryl": 24532, "psi": 24533, "stabbed": 24534, "offerings": 24535, "expects": 24536, "caval": 24537, "bodybuilding": 24538, "framing": 24539, "fca": 24540, "yearly": 24541, "bombed": 24542, "skil": 24543, "researching": 24544, "judiciary": 24545, "greeted": 24546, "tudor": 24547, "milo": 24548, "innovate": 24549, "ðŁĺĽ": 24550, "rhs": 24551, "ruby": 24552, "contributor": 24553, "famer": 24554, "socially": 24555, "mlin": 24556, "fiery": 24557, "utter": 24558, "beaut": 24559, "itos": 24560, "devoted": 24561, "rainbow": 24562, "barney": 24563, "peren": 24564, "arjun": 24565, "rna": 24566, "gabby": 24567, "uti": 24568, "hannity": 24569, "pickle": 24570, "serv": 24571, "quakes": 24572, "ppe": 24573, "fem": 24574, "whitec": 24575, "jn": 24576, "victories": 24577, "ðŁ§¡": 24578, "golfer": 24579, "congratulates": 24580, "resulting": 24581, "mechanic": 24582, "urve": 24583, "centered": 24584, "kiev": 24585, "ans": 24586, "incub": 24587, "<<": 24588, "cmo": 24589, "bestfanarmy": 24590, "daph": 24591, "enham": 24592, "oncology": 24593, "kush": 24594, "txt": 24595, "oriented": 
24596, "fashionable": 24597, "csr": 24598, "sahara": 24599, "rack": 24600, "pdp": 24601, "hanson": 24602, "à¸ĩ": 24603, "tiers": 24604, "rar": 24605, "panam": 24606, "insky": 24607, "sahi": 24608, "testament": 24609, "asthma": 24610, "inher": 24611, "fisheries": 24612, "order": 24613, "howe": 24614, "gallon": 24615, "epis": 24616, "suzanne": 24617, "drowning": 24618, "panelists": 24619, "ðŁĺ²": 24620, "ë¦": 24621, "alach": 24622, "commemorative": 24623, "attribu": 24624, "ðŁij»": 24625, "moo": 24626, "visional": 24627, "weeksary": 24628, "gust": 24629, "akin": 24630, "pointe": 24631, "eee": 24632, "dispar": 24633, "nipp": 24634, "dental": 24635, "stall": 24636, "pian": 24637, "bore": 24638, "ulster": 24639, "tick": 24640, "irr": 24641, "taehyung": 24642, "microphone": 24643, "bermuda": 24644, "gaard": 24645, "eler": 24646, "plumbing": 24647, "hugely": 24648, "âļ«ï¸ı": 24649, "raceway": 24650, "cambridge": 24651, "marcel": 24652, "burnley": 24653, "toast": 24654, "hollywood": 24655, "fasting": 24656, "mered": 24657, "hibition": 24658, "capped": 24659, "beneficial": 24660, "owning": 24661, "contamin": 24662, "arabian": 24663, "toon": 24664, "capac": 24665, "hulu": 24666, "smir": 24667, "nutrients": 24668, "sein": 24669, "graphs": 24670, "conditional": 24671, "ðŁijħ": 24672, "orac": 24673, "playin": 24674, "northe": 24675, "tornad": 24676, "marian": 24677, "jumbo": 24678, "lexi": 24679, "incredibleindia": 24680, "roadto": 24681, "ukone": 24682, "confusing": 24683, "sph": 24684, "shank": 24685, "pied": 24686, "mqm": 24687, "positively": 24688, "sherry": 24689, "pathways": 24690, "considers": 24691, "tofu": 24692, "arguments": 24693, "resilient": 24694, "chett": 24695, "withdra": 24696, "tero": 24697, "atedly": 24698, "swana": 24699, "heb": 24700, "flight": 24701, "harley": 24702, "decrease": 24703, "kindle": 24704, "bookshop": 24705, "³ï¸ı": 24706, "martyrs": 24707, "smur": 24708, "mccl": 24709, "concerto": 24710, "stime": 24711, "rejoice": 24712, "applau": 24713, "clement": 24714, "merkel": 24715, "jaime": 24716, "immortal": 24717, "isleof": 24718, "marco": 24719, "youtuber": 24720, "stalking": 24721, "metoo": 24722, "stack": 24723, "spouse": 24724, "ust": 24725, "luv": 24726, "âļ¾ï¸ı": 24727, "equestrian": 24728, "eving": 24729, "flin": 24730, "nickname": 24731, "thebig": 24732, "asar": 24733, "stacks": 24734, "walker": 24735, "bora": 24736, "kidnapped": 24737, "hurling": 24738, "humbold": 24739, "recalls": 24740, "copper": 24741, "annis": 24742, "seo": 24743, "merger": 24744, "muir": 24745, "addy": 24746, "ðŁĴªðŁĴª": 24747, "bex": 24748, "cracy": 24749, "conan": 24750, "congratulation": 24751, "midst": 24752, "âĻ¬": 24753, "forbi": 24754, "optic": 24755, "crate": 24756, "crocodile": 24757, "madagas": 24758, "securing": 24759, "aston": 24760, "ogue": 24761, "savior": 24762, "salisbury": 24763, "loveit": 24764, "fujifilm": 24765, "castles": 24766, "asst": 24767, "arrows": 24768, "spacious": 24769, "trs": 24770, "polyvore": 24771, "progression": 24772, "mri": 24773, "nelson": 24774, "bim": 24775, "indicator": 24776, "oda": 24777, "pepe": 24778, "resignation": 24779, "gut": 24780, "sneaker": 24781, "logically": 24782, "azy": 24783, "arella": 24784, "tearing": 24785, "joshi": 24786, "ssionism": 24787, "qpr": 24788, "mariah": 24789, "px": 24790, "bleed": 24791, "mian": 24792, "medley": 24793, "weiss": 24794, "kerry": 24795, "gatory": 24796, "atal": 24797, "madison": 24798, "avenger": 24799, "naby": 24800, "pland": 24801, "giles": 24802, "freshwater": 24803, "dington": 24804, "taj": 24805, 
"demonstrates": 24806, "ntv": 24807, "bulbs": 24808, "sundaymorning": 24809, "peake": 24810, "souvenir": 24811, "wah": 24812, "tonnes": 24813, "mkt": 24814, "complexity": 24815, "conden": 24816, "rossi": 24817, "bing": 24818, "yds": 24819, "suk": 24820, "ngo": 24821, "midland": 24822, "oly": 24823, "lifeis": 24824, "ripple": 24825, "moreno": 24826, "dders": 24827, "tus": 24828, "áĥ": 24829, "boul": 24830, "xa": 24831, "holdings": 24832, "wny": 24833, "shadowhunters": 24834, "kei": 24835, "aspire": 24836, "mous": 24837, "owen": 24838, "soak": 24839, "skirts": 24840, "mountaine": 24841, "storming": 24842, "chrome": 24843, "riots": 24844, "sarato": 24845, "amaze": 24846, "lessness": 24847, "navar": 24848, "criteria": 24849, "rafa": 24850, "indulge": 24851, "ayer": 24852, "porto": 24853, "namo": 24854, "................": 24855, "yields": 24856, "valle": 24857, "jh": 24858, "macron": 24859, "sains": 24860, "durant": 24861, "trailers": 24862, "wot": 24863, "confederate": 24864, "shrin": 24865, "idol": 24866, "formally": 24867, "tene": 24868, "motorcycles": 24869, "thang": 24870, "node": 24871, "banger": 24872, "daly": 24873, "pats": 24874, "enrollment": 24875, "auctions": 24876, "atal": 24877, "arbor": 24878, "logos": 24879, "dearest": 24880, "transaction": 24881, "domingo": 24882, "flea": 24883, "sermon": 24884, "deck": 24885, "sincere": 24886, "questioning": 24887, "julio": 24888, "wasp": 24889, "pretz": 24890, "armenian": 24891, "kham": 24892, "inflammation": 24893, "picturesque": 24894, "accidental": 24895, "filmmakers": 24896, "ðŁĺļ": 24897, "ðŁĴį": 24898, "casey": 24899, "sob": 24900, "yeezy": 24901, "goodwill": 24902, "paragra": 24903, "ssly": 24904, "feather": 24905, "dyed": 24906, "assassination": 24907, "nade": 24908, "bcs": 24909, "applies": 24910, "feminine": 24911, "feu": 24912, "extent": 24913, "deputies": 24914, "lack": 24915, "psychic": 24916, "goi": 24917, "killings": 24918, "pseu": 24919, "ðŁ¤ª": 24920, "unc": 24921, "marl": 24922, "tane": 24923, "mckenna": 24924, "surfer": 24925, "influences": 24926, "freeway": 24927, "hackney": 24928, "malaria": 24929, "eland": 24930, "teau": 24931, "remastered": 24932, "ر": 24933, "razor": 24934, "ggy": 24935, "corro": 24936, "laksh": 24937, "flair": 24938, "honesty": 24939, "hooray": 24940, "depp": 24941, "amc": 24942, "wednesdays": 24943, "qa": 24944, "edits": 24945, "-$": 24946, "sevilla": 24947, "doubled": 24948, "humanities": 24949, "ccot": 24950, "somos": 24951, "rine": 24952, "afa": 24953, "sioux": 24954, "reconstruction": 24955, "welding": 24956, "threads": 24957, "amish": 24958, "encouragement": 24959, "poder": 24960, "bock": 24961, "balm": 24962, "ptions": 24963, "standup": 24964, "accomplishments": 24965, "guarding": 24966, "conviction": 24967, "acion": 24968, "napoleon": 24969, "depicting": 24970, "attack": 24971, "sui": 24972, "wearable": 24973, "âĸªï¸ı": 24974, "potter": 24975, "escort": 24976, "vise": 24977, "tots": 24978, "boon": 24979, "eventprofs": 24980, "angular": 24981, "womenshistorymonth": 24982, "barrow": 24983, "schi": 24984, "accomp": 24985, "tik": 24986, "lend": 24987, "kensington": 24988, "wolfe": 24989, "stacked": 24990, "crashing": 24991, "exhibit": 24992, "winged": 24993, "sabrina": 24994, "masa": 24995, "kms": 24996, "always": 24997, "ett": 24998, "plasma": 24999, "counseling": 25000, "pickles": 25001, "nfldraft": 25002, "mrs": 25003, "inevitable": 25004, "courageous": 25005, "stafford": 25006, "writerslife": 25007, "hos": 25008, "ej": 25009, "ghyun": 25010, "trademark": 25011, "adrian": 25012, "influencer": 
25013, "coronation": 25014, "raging": 25015, "explored": 25016, "usaf": 25017, "exception": 25018, "eux": 25019, "tanker": 25020, "swami": 25021, "packet": 25022, "ðŁij¨âĢį": 25023, "fen": 25024, "sheen": 25025, "aero": 25026, "jl": 25027, "regal": 25028, "nwt": 25029, "auster": 25030, "mehta": 25031, "charge": 25032, "aste": 25033, "bate": 25034, "infeld": 25035, "racecourse": 25036, "collapsed": 25037, "fleece": 25038, "zil": 25039, "allie": 25040, "alternatives": 25041, "georges": 25042, "ðŁĵį": 25043, "quirky": 25044, "fcb": 25045, "natgeo": 25046, "philanthropy": 25047, "brai": 25048, "everyday": 25049, "ðŁIJ°": 25050, "achers": 25051, "jaan": 25052, "fines": 25053, "qi": 25054, "fisherman": 25055, "distinct": 25056, "grimes": 25057, "nationalist": 25058, "commence": 25059, "rown": 25060, "âĢ³": 25061, "zing": 25062, "fter": 25063, "hrw": 25064, "baroque": 25065, "blender": 25066, "kitty": 25067, "hooks": 25068, "cited": 25069, "wanda": 25070, "consensus": 25071, "reindeer": 25072, "anand": 25073, "supply": 25074, "meds": 25075, "vn": 25076, "olph": 25077, "ratchet": 25078, "sheldon": 25079, "securities": 25080, "ë°©íĥ": 25081, "crom": 25082, "mosquito": 25083, "jeric": 25084, "immac": 25085, "dimensions": 25086, "â¤": 25087, "dissi": 25088, "spongebob": 25089, "damien": 25090, "stevenson": 25091, "joanne": 25092, "delish": 25093, "yikes": 25094, "thanx": 25095, "surveys": 25096, "postponed": 25097, "alcoholic": 25098, "alised": 25099, "ðŁĻıðŁı»": 25100, "doch": 25101, "sentim": 25102, "meredith": 25103, "compares": 25104, "bago": 25105, "happydays": 25106, "moss": 25107, "ãħĭ": 25108, "nec": 25109, "gnment": 25110, "frustrated": 25111, "combin": 25112, "riv": 25113, "eclec": 25114, "collo": 25115, "compliment": 25116, "actorslife": 25117, "ctto": 25118, "nicar": 25119, "ophon": 25120, "aparthe": 25121, "mant": 25122, "jade": 25123, "trolley": 25124, "optimization": 25125, "eyeon": 25126, "ecological": 25127, "quist": 25128, "ephe": 25129, "à¥ĩ": 25130, "cinco": 25131, "appoints": 25132, "oldschool": 25133, "cpr": 25134, "behavioral": 25135, "minaj": 25136, ":-(": 25137, "tagging": 25138, "eval": 25139, "joaqu": 25140, "ðŁĺ«": 25141, "hak": 25142, "deme": 25143, "jamaican": 25144, "sos": 25145, "hyatt": 25146, "handbook": 25147, "librarian": 25148, "hannibal": 25149, "pumping": 25150, "chom": 25151, "fman": 25152, "gai": 25153, "hull": 25154, "responders": 25155, "greenville": 25156, "nus": 25157, "vaugh": 25158, "ðŁİīðŁİī": 25159, "taxi": 25160, "goldberg": 25161, "mantra": 25162, "tease": 25163, "forbidden": 25164, "methodist": 25165, "ativity": 25166, "****": 25167, "ect": 25168, "mcgr": 25169, "Ħëĭ": 25170, "seb": 25171, "amidst": 25172, "disappear": 25173, "thyro": 25174, "philips": 25175, "erina": 25176, "vicious": 25177, "streamer": 25178, "millionaire": 25179, "map": 25180, "strick": 25181, "hackathon": 25182, "gha": 25183, "edic": 25184, "mika": 25185, "peck": 25186, "illi": 25187, "antoine": 25188, "arca": 25189, "optic": 25190, "maure": 25191, "ðŁĩ¦ðŁĩº": 25192, "clashes": 25193, "manly": 25194, "âĺģ": 25195, "alvar": 25196, "andres": 25197, "mei": 25198, "elm": 25199, "wwww": 25200, "altered": 25201, "lte": 25202, "ê¹Ģ": 25203, "mojo": 25204, "forrest": 25205, "thalai": 25206, "nont": 25207, "speeches": 25208, "acknowledge": 25209, "ignite": 25210, "xfactor": 25211, "ðŁ¥Ĥ": 25212, "meadow": 25213, "disrupt": 25214, "debuted": 25215, "scrimmage": 25216, "pharmaceutical": 25217, "fidd": 25218, "foundations": 25219, "philosopher": 25220, "etal": 25221, "publishers": 25222, 
"boys": 25223, "cke": 25224, "rugged": 25225, "optimism": 25226, "rebe": 25227, "philharmon": 25228, "narcis": 25229, "rallies": 25230, "luis": 25231, "goblue": 25232, "folded": 25233, "unacceptable": 25234, "optimal": 25235, "lisa": 25236, "polaro": 25237, "+.": 25238, "enza": 25239, "âĿ£ï¸ı": 25240, "monopoly": 25241, "graceful": 25242, "dairy": 25243, "dua": 25244, "difficulty": 25245, "judgement": 25246, "osi": 25247, "mersey": 25248, "flux": 25249, "newfound": 25250, "terns": 25251, "dimensional": 25252, "invic": 25253, "alba": 25254, "amit": 25255, "abudhabi": 25256, "algeria": 25257, "automobile": 25258, "thead": 25259, "lotion": 25260, "accelerator": 25261, "vacant": 25262, "ition": 25263, "luf": 25264, "alic": 25265, "pll": 25266, "blazing": 25267, "baz": 25268, "sene": 25269, "ðŁij¼": 25270, "villains": 25271, "directory": 25272, "eisen": 25273, "tock": 25274, "brochure": 25275, "ripp": 25276, "hbd": 25277, "zaynmalik": 25278, "niche": 25279, "lolol": 25280, "certificates": 25281, "morse": 25282, "facup": 25283, "xham": 25284, "unwanted": 25285, "imports": 25286, "carnegie": 25287, "fansign": 25288, "mou": 25289, "ralph": 25290, "destroyer": 25291, "swing": 25292, "trekking": 25293, "ciliation": 25294, "pitbull": 25295, "gaps": 25296, "howell": 25297, "definitive": 25298, "mcle": 25299, "fps": 25300, "etz": 25301, "bolly": 25302, "lynn": 25303, "gano": 25304, "ature": 25305, "fursuit": 25306, "coil": 25307, "nav": 25308, "butts": 25309, "trojans": 25310, "eure": 25311, "enko": 25312, "schumer": 25313, "horrific": 25314, "installment": 25315, "brb": 25316, "suburbs": 25317, "abel": 25318, "vir": 25319, "desh": 25320, "cunningham": 25321, "ðŁIJ»": 25322, "spann": 25323, "schwe": 25324, "kemp": 25325, "tru": 25326, "stealth": 25327, "ques": 25328, "lew": 25329, "delights": 25330, "koch": 25331, "humili": 25332, "criti": 25333, "ilt": 25334, "spells": 25335, "miley": 25336, "caric": 25337, "ðŁį´": 25338, "lcfc": 25339, "substitute": 25340, "oung": 25341, "?!!": 25342, "affir": 25343, "predictable": 25344, "classof": 25345, "err": 25346, "cypress": 25347, "chandra": 25348, "ageing": 25349, "____": 25350, "therland": 25351, "doncaster": 25352, "elin": 25353, "yoshi": 25354, "sailors": 25355, "harris": 25356, "joanna": 25357, "nigerians": 25358, "hers": 25359, "plague": 25360, "procra": 25361, "kno": 25362, "canton": 25363, "busines": 25364, "unh": 25365, "prakash": 25366, "cin": 25367, "bowen": 25368, "coating": 25369, "mals": 25370, "begging": 25371, "smithson": 25372, "pontiac": 25373, "spies": 25374, "damian": 25375, "pline": 25376, "undant": 25377, "alta": 25378, "oness": 25379, "shameless": 25380, "daq": 25381, "bbm": 25382, "wales": 25383, "stampede": 25384, "serum": 25385, "ÙĨ": 25386, "catalyst": 25387, "xn": 25388, "absc": 25389, "freezer": 25390, "chun": 25391, "arios": 25392, "mccre": 25393, "forehead": 25394, "hears": 25395, "damascus": 25396, "tacoma": 25397, "arduino": 25398, "encounters": 25399, "stanton": 25400, "lgb": 25401, "abas": 25402, "\"..": 25403, "kete": 25404, "dracula": 25405, "elem": 25406, "gne": 25407, "zeppelin": 25408, "labrador": 25409, "pulp": 25410, "optional": 25411, "orn": 25412, "russians": 25413, "sanitation": 25414, "hilary": 25415, "etsymntt": 25416, "penalties": 25417, "aust": 25418, "igans": 25419, "olympian": 25420, "medicaid": 25421, "versace": 25422, "vape": 25423, "restra": 25424, "peep": 25425, "sexiest": 25426, "stalls": 25427, "dile": 25428, "thea": 25429, "punjabi": 25430, "puppy": 25431, "tuesdaymotivation": 25432, "ðŁĵļ": 25433, 
"theflash": 25434, "rocket": 25435, "modest": 25436, "chihuahu": 25437, "onna": 25438, "ksa": 25439, "hurdles": 25440, "cave": 25441, "failures": 25442, "split": 25443, "boho": 25444, "gurl": 25445, "disappoint": 25446, "howard": 25447, "nugget": 25448, "franz": 25449, "stalert": 25450, "kazakh": 25451, "forgetting": 25452, "schri": 25453, "agate": 25454, "amat": 25455, "everett": 25456, "duet": 25457, "veterinary": 25458, "julian": 25459, "chills": 25460, "brave": 25461, "ghostbusters": 25462, "lando": 25463, "greets": 25464, "profitable": 25465, "dé": 25466, "tir": 25467, "zee": 25468, "omen": 25469, "pdx": 25470, "grayson": 25471, "hari": 25472, "fixes": 25473, "stabbing": 25474, "swimmer": 25475, "symbols": 25476, "compliments": 25477, "pose": 25478, "functioning": 25479, "thnx": 25480, "gir": 25481, "corporations": 25482, "barlow": 25483, "loe": 25484, "offseason": 25485, "distinctive": 25486, "marvelous": 25487, "nikon": 25488, "enrique": 25489, "kyu": 25490, "jaws": 25491, "amoto": 25492, "lombar": 25493, "travelblogger": 25494, "fah": 25495, "ourism": 25496, "tristan": 25497, "soe": 25498, "cease": 25499, "ðŁıħ": 25500, "zac": 25501, "mckenzie": 25502, "taxpayers": 25503, "swimsuit": 25504, "blo": 25505, "lesley": 25506, "kansas": 25507, "wks": 25508, "kiel": 25509, "provoking": 25510, "myles": 25511, "string": 25512, "kangaroo": 25513, "galactic": 25514, "fifth": 25515, "ske": 25516, "weir": 25517, "llis": 25518, "matory": 25519, "ðŁĩ¿": 25520, "unci": 25521, "reproductive": 25522, "rooting": 25523, "tides": 25524, "gadget": 25525, "..........": 25526, "alexander": 25527, "bowler": 25528, "screw": 25529, "apolog": 25530, "erika": 25531, "walters": 25532, "shetty": 25533, "lane": 25534, "banter": 25535, "asant": 25536, "meso": 25537, "vain": 25538, "\"\"\"": 25539, "usi": 25540, "ferdin": 25541, "accomplish": 25542, "mansfield": 25543, "bombar": 25544, "collaborating": 25545, "clap": 25546, "iture": 25547, "sda": 25548, "smoky": 25549, "nak": 25550, "imperson": 25551, "carla": 25552, "comra": 25553, "burgl": 25554, "loco": 25555, "ties": 25556, "inhi": 25557, "tracey": 25558, "seis": 25559, "disser": 25560, "rrrr": 25561, "dray": 25562, "protect": 25563, "corona": 25564, "hunger": 25565, "cken": 25566, "celi": 25567, "troubled": 25568, "predators": 25569, "fictional": 25570, "shaved": 25571, "richest": 25572, "metaboli": 25573, "fulham": 25574, "grooming": 25575, "monochrome": 25576, "wasting": 25577, "asco": 25578, "aste": 25579, "tista": 25580, "remedies": 25581, "ungsoo": 25582, "southend": 25583, "permanently": 25584, "bumble": 25585, "procrastin": 25586, "identical": 25587, "practically": 25588, "mascul": 25589, "suke": 25590, "assured": 25591, "valerie": 25592, "deviant": 25593, "grizzlies": 25594, "thier": 25595, "pura": 25596, "nepal": 25597, "notts": 25598, "bilateral": 25599, "spoil": 25600, "carmel": 25601, "cinematic": 25602, "phl": 25603, "nifty": 25604, "mao": 25605, "hypocri": 25606, "laser": 25607, "pantry": 25608, "mathematical": 25609, "elisa": 25610, "coordination": 25611, "belmont": 25612, "ait": 25613, "radiant": 25614, "boiler": 25615, "mang": 25616, "fag": 25617, "crc": 25618, "hams": 25619, "brin": 25620, "â¬ĩï¸ı": 25621, "familia": 25622, "âĿ£": 25623, "saber": 25624, "rupert": 25625, "ggan": 25626, "ritz": 25627, "mich": 25628, "salford": 25629, "levi": 25630, "gral": 25631, "ðŁĴ¤": 25632, "nino": 25633, "ced": 25634, "businessman": 25635, "ultr": 25636, "simply": 25637, "compression": 25638, "pains": 25639, "halt": 25640, "ë°©íĥĦ": 25641, "landscaping": 
25642, "nf": 25643, "crooked": 25644, "erd": 25645, "ittin": 25646, "ddleston": 25647, "surpassed": 25648, "inoa": 25649, "dag": 25650, "blen": 25651, "extending": 25652, "ating": 25653, "algae": 25654, "baller": 25655, "umar": 25656, "snooker": 25657, "collu": 25658, "flown": 25659, "thub": 25660, "ridiculously": 25661, "kish": 25662, "ople": 25663, "dire": 25664, "asser": 25665, "aristo": 25666, "sciss": 25667, "hating": 25668, "trouble": 25669, "sylvia": 25670, "succul": 25671, "plots": 25672, "sincerely": 25673, "aler": 25674, "laureate": 25675, "brack": 25676, "attn": 25677, "rifles": 25678, "meto": 25679, "collectible": 25680, "cuomo": 25681, "contestant": 25682, "consistency": 25683, "antz": 25684, "ranges": 25685, "abigail": 25686, "deb": 25687, "minister": 25688, "growers": 25689, "anoo": 25690, "hoover": 25691, "dreamer": 25692, "nucle": 25693, "research": 25694, "miy": 25695, "shahid": 25696, "mav": 25697, "dhoni": 25698, "cini": 25699, "doj": 25700, "hindus": 25701, "partying": 25702, "dali": 25703, "alonso": 25704, "informal": 25705, "clarkson": 25706, "itton": 25707, "kian": 25708, "cityo": 25709, "mori": 25710, "lasted": 25711, "aspen": 25712, "library": 25713, "suspici": 25714, "quat": 25715, "denial": 25716, "folder": 25717, "chori": 25718, "sweeping": 25719, "enix": 25720, "ðŁįĤ": 25721, "ØŃ": 25722, "nascar": 25723, "handmadehour": 25724, "moul": 25725, "heatwave": 25726, "emer": 25727, "examine": 25728, "ibn": 25729, "grind": 25730, "pov": 25731, "tionist": 25732, "mbo": 25733, "sheila": 25734, "integrate": 25735, "omes": 25736, "takeaway": 25737, "cerv": 25738, "connie": 25739, "ticket": 25740, "celed": 25741, "bien": 25742, "visually": 25743, "madagascar": 25744, "sorry": 25745, "gui": 25746, "parkrun": 25747, "traits": 25748, "labe": 25749, "poisoning": 25750, "à¥Ģ": 25751, "viable": 25752, "bohemian": 25753, "dentistry": 25754, "bados": 25755, "sprouts": 25756, "masked": 25757, "teddy": 25758, "ðŁĺ·": 25759, "saf": 25760, "saas": 25761, "jiang": 25762, "tight": 25763, "speaker": 25764, "withdrawal": 25765, "bcn": 25766, "assigned": 25767, "classrooms": 25768, "fleming": 25769, "ðŁĴ«": 25770, "supergirl": 25771, "totals": 25772, "tabletop": 25773, "ebooks": 25774, "horizontal": 25775, "craz": 25776, "flush": 25777, "jard": 25778, "cdc": 25779, "erson": 25780, "ãħł": 25781, "greenwood": 25782, "nih": 25783, "cox": 25784, "ada": 25785, "litre": 25786, "going": 25787, "vicky": 25788, "curved": 25789, "louie": 25790, "grains": 25791, "hye": 25792, "longe": 25793, "remedy": 25794, "trainee": 25795, "sanjay": 25796, "superstars": 25797, "maser": 25798, "manu": 25799, "sage": 25800, "whl": 25801, "ðŁĺĤðŁĺŃ": 25802, "ðŁijįðŁı»": 25803, "msd": 25804, "enz": 25805, "rabhu": 25806, "joo": 25807, "ghu": 25808, "acer": 25809, "epo": 25810, "resurrection": 25811, "justicefor": 25812, "blended": 25813, "moda": 25814, "avalanche": 25815, "francesco": 25816, "respective": 25817, "gs": 25818, "yeast": 25819, "welch": 25820, "devotion": 25821, "getin": 25822, "atheism": 25823, "amic": 25824, "carolyn": 25825, "loc": 25826, "ldnont": 25827, "avec": 25828, "usda": 25829, "legged": 25830, "bravery": 25831, "blower": 25832, "cowboy": 25833, "heh": 25834, "stible": 25835, "buffal": 25836, "channel": 25837, "runchat": 25838, "âĺķï¸ı": 25839, "ideology": 25840, "bestseller": 25841, "yoo": 25842, "peanu": 25843, "bonne": 25844, "felic": 25845, "edison": 25846, "fractu": 25847, "narendra": 25848, "ppets": 25849, "seymour": 25850, "riviera": 25851, "hector": 25852, "necessarily": 25853, 
"bianca": 25854, "societies": 25855, "thebest": 25856, "wg": 25857, "sentences": 25858, "wink": 25859, "vaccines": 25860, "palooza": 25861, "jamming": 25862, "asf": 25863, "mpus": 25864, "agreements": 25865, "eck": 25866, "bac": 25867, "honore": 25868, "compul": 25869, "wildcat": 25870, "imposed": 25871, "yoga": 25872, "hudson": 25873, "canceled": 25874, "lich": 25875, "fuzzy": 25876, "esque": 25877, "chuk": 25878, "wvu": 25879, "sek": 25880, "flipping": 25881, "rhon": 25882, "wished": 25883, "wha": 25884, "capability": 25885, "lenovo": 25886, "ìĨĮëħĦëĭ": 25887, "vivo": 25888, "tvd": 25889, "nora": 25890, "silk": 25891, "pasadena": 25892, "yosemite": 25893, "valuation": 25894, "clocks": 25895, "uber": 25896, "mrc": 25897, "darkest": 25898, "aubre": 25899, "sso": 25900, "belly": 25901, "wrestlers": 25902, "killin": 25903, "louder": 25904, "buckley": 25905, "geel": 25906, "adon": 25907, "uns": 25908, "appealing": 25909, "ðŁij¯": 25910, "semitism": 25911, "listens": 25912, "fitz": 25913, "ãĥ³ãĥ": 25914, "nylon": 25915, "arty": 25916, "seemingly": 25917, "hala": 25918, "suited": 25919, "ety": 25920, "sheds": 25921, "muffins": 25922, "apric": 25923, "uments": 25924, "uta": 25925, "jammu": 25926, "chelseafc": 25927, "starz": 25928, "yoko": 25929, "root": 25930, "cleansing": 25931, "diar": 25932, "pioneering": 25933, "iheartradio": 25934, "digiti": 25935, "findyour": 25936, "cano": 25937, "ðŁĴİ": 25938, "zol": 25939, "spacecraft": 25940, "sixers": 25941, "moisturi": 25942, "bile": 25943, "tists": 25944, "horton": 25945, "ranging": 25946, "columbi": 25947, "meteoro": 25948, "sentiment": 25949, "epl": 25950, "footh": 25951, "textbook": 25952, "drainage": 25953, "rly": 25954, "scue": 25955, "imrankhan": 25956, "ðŁĴ¸": 25957, "margarita": 25958, "eddy": 25959, "predicts": 25960, "gamergate": 25961, "advise": 25962, "growthhacking": 25963, "loveyou": 25964, "ugand": 25965, "vf": 25966, "benghazi": 25967, "slater": 25968, "newor": 25969, "chel": 25970, "independenceday": 25971, "pnp": 25972, "cullen": 25973, "hoodies": 25974, "numbered": 25975, "britt": 25976, "tsa": 25977, "kltu": 25978, "sages": 25979, "momo": 25980, "oneplus": 25981, "coll": 25982, "guts": 25983, "wta": 25984, "mesmeri": 25985, "enhancing": 25986, "chiroprac": 25987, "jis": 25988, "teenagers": 25989, "mone": 25990, "constellation": 25991, "sweepstakes": 25992, "eze": 25993, "slovakia": 25994, "laye": 25995, "pearce": 25996, "waver": 25997, "pogba": 25998, "kron": 25999, "surgeons": 26000, "marx": 26001, "tid": 26002, "gga": 26003, "descend": 26004, "pours": 26005, "uprising": 26006, "walla": 26007, "sabbath": 26008, "bachelore": 26009, "mackin": 26010, "kam": 26011, "peterborough": 26012, "hora": 26013, "ðŁĮŁðŁĮŁ": 26014, "thinkbig": 26015, "rj": 26016, "hydrau": 26017, "spal": 26018, "universit": 26019, "ðŁıī": 26020, "mailonline": 26021, "leagueof": 26022, "tenants": 26023, "wally": 26024, "lance": 26025, "heavens": 26026, "ddr": 26027, "bolts": 26028, "amir": 26029, "iphone": 26030, "cigar": 26031, "endu": 26032, "rei": 26033, "elabor": 26034, "ringing": 26035, "johnson": 26036, "characteristics": 26037, "saloon": 26038, "algorithms": 26039, "talkin": 26040, "mtn": 26041, "dive": 26042, "regionals": 26043, "ffice": 26044, "hati": 26045, "deviantart": 26046, "sotto": 26047, "shiro": 26048, "lama": 26049, "kwe": 26050, "faded": 26051, "porting": 26052, "tummy": 26053, "estates": 26054, "buenos": 26055, "ðŁ¦ģ": 26056, "believer": 26057, "penetr": 26058, "darn": 26059, "spite": 26060, "canopy": 26061, "fashioni": 26062, "tilla": 
26063, "petals": 26064, "elijah": 26065, "brawl": 26066, "martyr": 26067, "ë°©íĥĦìĨĮëħĦëĭ": 26068, "midtown": 26069, "erich": 26070, "dapper": 26071, "smtown": 26072, "megam": 26073, "www": 26074, "lele": 26075, "ons": 26076, "catfish": 26077, "firth": 26078, "fossilfriday": 26079, "ballpark": 26080, "thaw": 26081, "potent": 26082, "illie": 26083, "creep": 26084, "carp": 26085, "soap": 26086, "gundam": 26087, "infec": 26088, "yyyyy": 26089, "न": 26090, "zag": 26091, "ritt": 26092, "calculator": 26093, "boca": 26094, "oko": 26095, "toad": 26096, "threaten": 26097, "refined": 26098, "olympic": 26099, "accomplishment": 26100, "bacterial": 26101, "aji": 26102, "tatum": 26103, "feliz": 26104, "sheed": 26105, "jat": 26106, "thic": 26107, "jamal": 26108, "ðĿĺ": 26109, "lina": 26110, "ðŁIJ¯": 26111, "joking": 26112, "yotpo": 26113, "pinch": 26114, "akron": 26115, "herb": 26116, "motivation": 26117, "lia": 26118, "hostage": 26119, "creek": 26120, "gamble": 26121, "russell": 26122, "patti": 26123, "fotos": 26124, "cpc": 26125, "broken": 26126, "backthe": 26127, "clays": 26128, "umm": 26129, "stockton": 26130, "maternal": 26131, "ür": 26132, "lakel": 26133, "century": 26134, "bek": 26135, "infected": 26136, "ม": 26137, "smackdown": 26138, "manned": 26139, "tahoe": 26140, "smes": 26141, "basa": 26142, "sula": 26143, "augusta": 26144, ".*": 26145, "rohingya": 26146, "greed": 26147, "counselor": 26148, "silhouette": 26149, "gravit": 26150, "clause": 26151, "'-": 26152, "bobc": 26153, "occasions": 26154, "nowadays": 26155, "dictat": 26156, "beard": 26157, "nally": 26158, "brightest": 26159, "kabul": 26160, "incindia": 26161, "dhanush": 26162, "archaeological": 26163, "cheape": 26164, "mizzou": 26165, "dhi": 26166, "ovski": 26167, "baxter": 26168, "assemble": 26169, "â": 26170, "gigi": 26171, "acam": 26172, "wisely": 26173, "hazard": 26174, "northampton": 26175, "âľĪï¸ı": 26176, "meth": 26177, "blasting": 26178, "reunite": 26179, "mulus": 26180, "alizes": 26181, "tread": 26182, "mila": 26183, "edward": 26184, "kova": 26185, "pesto": 26186, "ðŁij¶": 26187, "vitz": 26188, "hydraulic": 26189, "refurbished": 26190, "motel": 26191, "isabella": 26192, "homme": 26193, "severance": 26194, "uphol": 26195, "miserable": 26196, "fari": 26197, "latter": 26198, "efer": 26199, "crackers": 26200, "esl": 26201, "acio": 26202, "yyj": 26203, "inan": 26204, "ecb": 26205, "zind": 26206, "panas": 26207, "trucking": 26208, "reed": 26209, "shaker": 26210, "burgess": 26211, "empire": 26212, "agnes": 26213, "nington": 26214, "artworks": 26215, "frs": 26216, "tile": 26217, "biome": 26218, "eun": 26219, "chong": 26220, "americana": 26221, "godfather": 26222, "goblin": 26223, "ishi": 26224, "!).": 26225, "tempted": 26226, "genomics": 26227, "mandate": 26228, "cky": 26229, "ðŁĴĻðŁĴĽ": 26230, "somali": 26231, "brandy": 26232, "inven": 26233, "spokesperson": 26234, "pcb": 26235, "yuan": 26236, "hg": 26237, "faz": 26238, "starwars": 26239, "rowan": 26240, "bluegrass": 26241, "dong": 26242, "dday": 26243, "trinidad": 26244, "erton": 26245, "banning": 26246, "retention": 26247, "cured": 26248, "toberfest": 26249, "reset": 26250, "weis": 26251, "detached": 26252, "behindthescenes": 26253, "immunity": 26254, "pha": 26255, "bray": 26256, "ðŁij½": 26257, "rancho": 26258, "ramsay": 26259, "estonia": 26260, "ndtv": 26261, "].": 26262, "cabaret": 26263, "taro": 26264, "dv": 26265, "showcases": 26266, "plum": 26267, "ðŁij¸": 26268, "sonoma": 26269, "prepa": 26270, "memorab": 26271, "estu": 26272, "driveway": 26273, "ules": 26274, "magnus": 26275, 
"xr": 26276, "nnn": 26277, "muchas": 26278, "enge": 26279, "streamed": 26280, "forestry": 26281, "audiobook": 26282, "troy": 26283, "reckless": 26284, "kilom": 26285, "ruler": 26286, "rak": 26287, "procession": 26288, "ions": 26289, "poole": 26290, "noctur": 26291, "whs": 26292, "farmhouse": 26293, "pera": 26294, "parme": 26295, "hypocrisy": 26296, "sics": 26297, "vant": 26298, "cask": 26299, "holistic": 26300, "aust": 26301, "п": 26302, "indo": 26303, "ðŁij©âĢį": 26304, "diso": 26305, "dispatch": 26306, "olsen": 26307, "makeit": 26308, "ennis": 26309, "centre": 26310, "arrange": 26311, "ðŁĮ¼": 26312, "salted": 26313, "easiest": 26314, "fate": 26315, "regatta": 26316, "mozz": 26317, "acan": 26318, "sini": 26319, "gically": 26320, "chops": 26321, "chicken": 26322, "workin": 26323, "hagg": 26324, "involve": 26325, "weeds": 26326, "bookday": 26327, "wakeup": 26328, "kyr": 26329, "michelin": 26330, "fuss": 26331, "rejuven": 26332, "vacancies": 26333, "incarcer": 26334, "mst": 26335, "scents": 26336, "sovereign": 26337, "kicker": 26338, "à§": 26339, "bod": 26340, "âĢĶ>": 26341, "sah": 26342, "mobil": 26343, "shropshire": 26344, "ophone": 26345, "dresser": 26346, "missuni": 26347, "hepburn": 26348, "imo": 26349, "foliage": 26350, "diagnostic": 26351, "assan": 26352, "cycling": 26353, "guilt": 26354, "csa": 26355, "puertorico": 26356, "winelover": 26357, "wakefield": 26358, "doggy": 26359, "khe": 26360, "papp": 26361, "cog": 26362, "allot": 26363, "cuck": 26364, "poetic": 26365, "mio": 26366, "revit": 26367, "magician": 26368, "ç¥": 26369, "antenna": 26370, "westwood": 26371, "mberg": 26372, "luxe": 26373, "oatmeal": 26374, "ج": 26375, "teat": 26376, "ffee": 26377, "searches": 26378, "lly": 26379, "pluto": 26380, "elon": 26381, "lettering": 26382, "innocence": 26383, "fai": 26384, "annon": 26385, "telangana": 26386, "mait": 26387, "neural": 26388, "canni": 26389, "aroma": 26390, "astor": 26391, "fex": 26392, "cocac": 26393, "monetary": 26394, "fent": 26395, "unsure": 26396, "'@": 26397, "indirec": 26398, "tehran": 26399, "isolation": 26400, "libs": 26401, "makeup": 26402, "mercedes": 26403, "ffy": 26404, "hetero": 26405, "deo": 26406, "scom": 26407, "cursed": 26408, "veteransday": 26409, "frankenstein": 26410, "shrews": 26411, "deco": 26412, "geese": 26413, "leftover": 26414, "hadid": 26415, "variable": 26416, "academics": 26417, "carolin": 26418, "undergoing": 26419, "variation": 26420, "nah": 26421, "ssier": 26422, "gamersunite": 26423, "pursuing": 26424, "emerged": 26425, "llers": 26426, "controlling": 26427, "roaring": 26428, "meteor": 26429, "volt": 26430, "dawgs": 26431, "beaver": 26432, "islife": 26433, "bathrooms": 26434, "acional": 26435, "prevent": 26436, "lakedistrict": 26437, "inals": 26438, "yani": 26439, "grabbing": 26440, "sacks": 26441, "lez": 26442, "sway": 26443, "kool": 26444, "times": 26445, "klopp": 26446, "lade": 26447, "concord": 26448, "resulted": 26449, "revive": 26450, "reconciliation": 26451, "oland": 26452, "azz": 26453, "giro": 26454, "mandarin": 26455, "deen": 26456, "nutritional": 26457, "iscoming": 26458, "vani": 26459, "awwww": 26460, "derived": 26461, "loveyour": 26462, "stopthe": 26463, "shouting": 26464, "novak": 26465, "ðŁĻĮðŁı¾": 26466, "loaf": 26467, "displaying": 26468, "sundaywith": 26469, "maguire": 26470, "cheri": 26471, "ðŁıŁ": 26472, "rematch": 26473, "quic": 26474, "Ú©": 26475, "yin": 26476, "ðŁĺ¹": 26477, "ilive": 26478, "zip": 26479, "ourke": 26480, "downloads": 26481, "swat": 26482, "mississ": 26483, "carers": 26484, "tment": 26485, "property": 
26486, "hahahahahaha": 26487, "gibbs": 26488, "surrey": 26489, "arise": 26490, "ticism": 26491, "stia": 26492, "irling": 26493, "frog": 26494, "cose": 26495, "bassist": 26496, "foreig": 26497, "leau": 26498, "pillows": 26499, "holla": 26500, "elie": 26501, "disclosure": 26502, "peanuts": 26503, "intech": 26504, "wwc": 26505, "plunge": 26506, "triumph": 26507, "cori": 26508, "slippers": 26509, "ðŁĻıðŁĻı": 26510, "neutrality": 26511, "mare": 26512, "hairy": 26513, "gangster": 26514, "humming": 26515, "custard": 26516, "merlin": 26517, "alea": 26518, "sby": 26519, "damp": 26520, "mohan": 26521, "verbal": 26522, "jst": 26523, "gutted": 26524, "bjor": 26525, "unfinished": 26526, "ðŁĩ¯ðŁĩµ": 26527, "unhappy": 26528, "âļ«ï¸ı": 26529, "bypass": 26530, "atsu": 26531, "fischer": 26532, "sav": 26533, "africans": 26534, "reuse": 26535, "midway": 26536, "demolished": 26537, "gerrard": 26538, "hercules": 26539, "ÄŁ": 26540, "medicines": 26541, "clicking": 26542, "surround": 26543, "joong": 26544, "waving": 26545, "tribes": 26546, "wetlands": 26547, "officiel": 26548, "arguing": 26549, "lle": 26550, "dova": 26551, "suzy": 26552, "clubhouse": 26553, "negro": 26554, "obtain": 26555, "gao": 26556, "glance": 26557, "assist": 26558, "chos": 26559, "ãĤ¢": 26560, "âĺķ": 26561, "adrid": 26562, "occurs": 26563, "stans": 26564, "pardon": 26565, "liveli": 26566, "employed": 26567, "revisit": 26568, "ffxiv": 26569, "bble": 26570, "nearing": 26571, "miner": 26572, "ðŁĺ¹": 26573, "giovanni": 26574, "upto": 26575, "marvell": 26576, "marse": 26577, "towels": 26578, "cbn": 26579, "engineered": 26580, "yelling": 26581, "spartan": 26582, "sians": 26583, "ðŁĻĮðŁı¼": 26584, "sev": 26585, "coyote": 26586, "stadi": 26587, "tcm": 26588, "appen": 26589, "shenanigans": 26590, "openaccess": 26591, "soaked": 26592, "masqu": 26593, "levine": 26594, "strokes": 26595, "lk": 26596, "apartheid": 26597, "hiphop": 26598, "chardon": 26599, "maymay": 26600, "haasan": 26601, "stripped": 26602, "fro": 26603, "scription": 26604, "fton": 26605, "hf": 26606, "prisons": 26607, "marshal": 26608, "ķãĤ": 26609, "ancho": 26610, "compromise": 26611, "classification": 26612, "buzzfeed": 26613, "bbloggers": 26614, "deserving": 26615, ")/": 26616, "sway": 26617, "obo": 26618, "campers": 26619, "podernfamily": 26620, "poured": 26621, "brie": 26622, "squirrels": 26623, "seize": 26624, ":#": 26625, "lek": 26626, "timb": 26627, "stacy": 26628, "nasdaq": 26629, "repeatedly": 26630, "brat": 26631, "mighty": 26632, "competitor": 26633, "mahone": 26634, "desi": 26635, "oke": 26636, "bmw": 26637, "shie": 26638, "fcb": 26639, "cheapest": 26640, "minimalist": 26641, "paramount": 26642, "nate": 26643, "haras": 26644, "insanity": 26645, "lateral": 26646, "mentality": 26647, "mozam": 26648, "tapped": 26649, "yadav": 26650, "usp": 26651, "bway": 26652, "theod": 26653, "bilt": 26654, "raids": 26655, "empress": 26656, "adapted": 26657, "patron": 26658, "nutshell": 26659, "agra": 26660, "beaded": 26661, "sundaywithmarsha": 26662, "viking": 26663, "proceed": 26664, "maintained": 26665, "thinkbigsundaywithmarsha": 26666, "snes": 26667, "musica": 26668, "tower": 26669, "chab": 26670, "bok": 26671, "smt": 26672, "insult": 26673, "harvesting": 26674, "window": 26675, "ruther": 26676, "beige": 26677, "decal": 26678, "indicate": 26679, "mailing": 26680, "rift": 26681, "pole": 26682, "anderson": 26683, "choral": 26684, "spride": 26685, "lili": 26686, "evelyn": 26687, "imrankhanpti": 26688, "....\"": 26689, "kered": 26690, "undp": 26691, "waterfalls": 26692, "sears": 26693, 
"lemans": 26694, "worldseries": 26695, "riel": 26696, "anie": 26697, "appar": 26698, "scorers": 26699, "lamp": 26700, "athan": 26701, "physicians": 26702, "quinoa": 26703, "refusing": 26704, "vuitton": 26705, "unleash": 26706, "sla": 26707, "pati": 26708, "shouts": 26709, "intentions": 26710, "foamed": 26711, "european": 26712, "neighborhoods": 26713, "meer": 26714, "manson": 26715, "duh": 26716, "brat": 26717, "cones": 26718, "bowl": 26719, "kazakhstan": 26720, "ि": 26721, "inappropriate": 26722, "delhi": 26723, "ketchup": 26724, "fulton": 26725, "sys": 26726, "consult": 26727, "garfield": 26728, "togo": 26729, "fml": 26730, "fled": 26731, "bds": 26732, "facilitate": 26733, "reebok": 26734, "selfie": 26735, "elevate": 26736, "activate": 26737, "bible": 26738, "cawx": 26739, "bys": 26740, "camille": 26741, "syou": 26742, "skool": 26743, "hert": 26744, "wbc": 26745, "pledges": 26746, "recorder": 26747, "posh": 26748, "acre": 26749, "soaking": 26750, "matil": 26751, "vsco": 26752, "shootings": 26753, "plar": 26754, "econ": 26755, "ðŁĻĮðŁı»": 26756, "rashid": 26757, "ubi": 26758, "ðŁ¤¤": 26759, "swinging": 26760, "wipe": 26761, "raptor": 26762, "msu": 26763, "musicvideo": 26764, "durham": 26765, "attic": 26766, "aparty": 26767, "fetus": 26768, "activation": 26769, "aaz": 26770, "motivate": 26771, "ðŁĴķðŁĴķðŁĴķ": 26772, "jal": 26773, "म": 26774, "agon": 26775, "scheer": 26776, "stalker": 26777, "foster": 26778, "azzo": 26779, "telegram": 26780, "vigor": 26781, "slaugh": 26782, "screenshots": 26783, "entrepreneu": 26784, "kristin": 26785, "intention": 26786, "chilli": 26787, "fraction": 26788, "dona": 26789, "gea": 26790, "tcu": 26791, "site": 26792, "lak": 26793, "emil": 26794, "dnt": 26795, "boro": 26796, "wilkinson": 26797, "recu": 26798, "atoday": 26799, "tanya": 26800, "blanco": 26801, "cdn": 26802, "brilliantly": 26803, "gcc": 26804, "acc": 26805, "evacuated": 26806, "therine": 26807, "denny": 26808, "caitlin": 26809, "shepard": 26810, "pouch": 26811, "handheld": 26812, "southeastern": 26813, "haa": 26814, "ô": 26815, "resolutions": 26816, "ledger": 26817, "srin": 26818, "rar": 26819, "shattered": 26820, "chimney": 26821, "imwith": 26822, "meteor": 26823, "handled": 26824, "rake": 26825, "townsend": 26826, "enhan": 26827, "shipy": 26828, "duct": 26829, "twx": 26830, "inflammatory": 26831, "warhammer": 26832, "theatrical": 26833, "gros": 26834, "skar": 26835, "scotty": 26836, "niel": 26837, "tito": 26838, "tini": 26839, "connection": 26840, "_.": 26841, "goldenglobes": 26842, "shaq": 26843, "ðŁı³ï¸ı": 26844, "hallway": 26845, "fronts": 26846, "effectiveness": 26847, "glaston": 26848, "dhs": 26849, "expi": 26850, "toh": 26851, "cpl": 26852, "scs": 26853, "reo": 26854, "hag": 26855, "resemblance": 26856, "horan": 26857, "abusive": 26858, "quer": 26859, "virtue": 26860, "cholester": 26861, "aq": 26862, "shane": 26863, "mce": 26864, "carriers": 26865, "distress": 26866, "rewind": 26867, "¡": 26868, "voodoo": 26869, "intact": 26870, "anno": 26871, "ðŁĺ¤": 26872, "piled": 26873, "adia": 26874, "ãĥ³": 26875, "enow": 26876, "digs": 26877, "lightly": 26878, "goofy": 26879, "turbine": 26880, "governors": 26881, "conte": 26882, "reopen": 26883, "pah": 26884, "ive": 26885, "crafting": 26886, "sweeps": 26887, "jodi": 26888, "ande": 26889, "zucker": 26890, "kawaii": 26891, "oko": 26892, "vai": 26893, "outline": 26894, "kristi": 26895, "tsn": 26896, "inspo": 26897, "quint": 26898, "filthy": 26899, "lynne": 26900, "listeners": 26901, "departing": 26902, "ord": 26903, "tweed": 26904, ",&": 26905, "alek": 
26906, "selfish": 26907, "norther": 26908, "recognizes": 26909, "ips": 26910, "bes": 26911, "aed": 26912, "wills": 26913, "peat": 26914, "surroundings": 26915, "monuments": 26916, "aisle": 26917, "becker": 26918, "lav": 26919, "quantity": 26920, "vah": 26921, "helicopters": 26922, "tucked": 26923, "alvarez": 26924, "shape": 26925, "obey": 26926, "additi": 26927, "roadside": 26928, "mite": 26929, "blers": 26930, "epage": 26931, "jau": 26932, "ignorant": 26933, "bins": 26934, "lulu": 26935, "xo": 26936, "cfo": 26937, "eeeee": 26938, "apprenticeship": 26939, "sheffiel": 26940, "toi": 26941, "hok": 26942, "fakenews": 26943, "deploy": 26944, "aidan": 26945, "huskers": 26946, "ãĢİ": 26947, "westbrook": 26948, "mister": 26949, "configur": 26950, "carr": 26951, "fica": 26952, "proceedings": 26953, "haw": 26954, "steak": 26955, "murderer": 26956, "payday": 26957, "ajo": 26958, "pvc": 26959, "donates": 26960, "biaf": 26961, "nomnom": 26962, "beit": 26963, "kali": 26964, "xrp": 26965, "ahmedabad": 26966, "semic": 26967, "chey": 26968, "xtra": 26969, "antwer": 26970, "headlining": 26971, "squares": 26972, "rounded": 26973, "fluore": 26974, "bold": 26975, "disasters": 26976, "amoo": 26977, "generic": 26978, "cranes": 26979, "briefly": 26980, "gig": 26981, "austerity": 26982, "anticipation": 26983, "forti": 26984, "treasurer": 26985, "canny": 26986, "cecil": 26987, "detected": 26988, "checklist": 26989, "ว": 26990, "pamela": 26991, "barbados": 26992, "anfield": 26993, "hearty": 26994, "txlege": 26995, "perenni": 26996, "arrog": 26997, "ingram": 26998, "âĹı": 26999, "tyne": 27000, "spoon": 27001, "ration": 27002, "amba": 27003, "mbe": 27004, "camel": 27005, "hhs": 27006, "yorkshire": 27007, "reflective": 27008, "freaks": 27009, "tok": 27010, "judo": 27011, "particles": 27012, "dubs": 27013, "banjo": 27014, "accreditation": 27015, "proverbs": 27016, "overdose": 27017, "integral": 27018, "guang": 27019, "mcs": 27020, "supercar": 27021, "afb": 27022, "alvin": 27023, "ails": 27024, "xtre": 27025, "staging": 27026, "twent": 27027, "rabbits": 27028, "maro": 27029, "instem": 27030, "doll": 27031, "cray": 27032, "santana": 27033, "bleach": 27034, "minions": 27035, "cheap": 27036, "mant": 27037, "divers": 27038, "catalonia": 27039, "lois": 27040, "matri": 27041, "cougar": 27042, "kayak": 27043, "egre": 27044, "pso": 27045, "aia": 27046, "å®": 27047, "charlton": 27048, "tracked": 27049, "scari": 27050, "pett": 27051, "fwd": 27052, "xin": 27053, "gravel": 27054, "bric": 27055, "biggboss": 27056, "arden": 27057, "hugging": 27058, "palms": 27059, "stv": 27060, "limb": 27061, "themovie": 27062, "handicap": 27063, "rime": 27064, "zai": 27065, "stub": 27066, "india": 27067, "lithuania": 27068, "rhyth": 27069, "pita": 27070, "macedonia": 27071, "highered": 27072, "bridget": 27073, "schwarz": 27074, "skelet": 27075, "hikes": 27076, "antarctic": 27077, "cps": 27078, "mashup": 27079, "а": 27080, "nell": 27081, "chandra": 27082, "heir": 27083, "anus": 27084, "sheridan": 27085, "mimi": 27086, "museu": 27087, "becca": 27088, "anir": 27089, "barrie": 27090, "diocese": 27091, "comparable": 27092, "ðŁı³ï¸ıâĢį": 27093, "yukon": 27094, "mep": 27095, "hormon": 27096, "meric": 27097, "alf": 27098, "conquered": 27099, "christchurch": 27100, "ðŁĴĻðŁĴĻ": 27101, "hazardous": 27102, "pooh": 27103, "conting": 27104, "retrospective": 27105, "parame": 27106, "nair": 27107, "consor": 27108, "hotra": 27109, "astonishing": 27110, "caterpillar": 27111, "uman": 27112, "tism": 27113, "tvs": 27114, "servic": 27115, "croydon": 27116, "morales": 
27117, "cg": 27118, "cum": 27119, "teur": 27120, "scanada": 27121, "sall": 27122, "magnolia": 27123, "elise": 27124, "thour": 27125, "ி": 27126, "agomez": 27127, "phelps": 27128, "ë°©íĥĦìĨĮëħĦëĭ¨": 27129, "whos": 27130, "weaving": 27131, "sisd": 27132, "proposes": 27133, "crows": 27134, "presale": 27135, "economies": 27136, "bernardo": 27137, "shahid": 27138, "airshow": 27139, "mccann": 27140, "horticul": 27141, "nrl": 27142, "duel": 27143, "mongolia": 27144, "toulou": 27145, "requirement": 27146, "structured": 27147, "edi": 27148, "olives": 27149, "hea": 27150, "cuter": 27151, "к": 27152, "enthusiast": 27153, "harriet": 27154, "dominion": 27155, "submer": 27156, "ðŁįĥ": 27157, "saab": 27158, "nesburg": 27159, "moff": 27160, "defended": 27161, "burt": 27162, "rewarded": 27163, "goldman": 27164, "optics": 27165, "khalid": 27166, "households": 27167, "buckets": 27168, "cecil": 27169, "chess": 27170, "substantial": 27171, "efl": 27172, "operation": 27173, "evaluate": 27174, "stn": 27175, "recession": 27176, "lll": 27177, "tomas": 27178, "truths": 27179, "akbar": 27180, "swords": 27181, "pact": 27182, "embarrass": 27183, "hao": 27184, "ayurve": 27185, "scripture": 27186, "nycc": 27187, "opt": 27188, "diameter": 27189, "scented": 27190, "organizers": 27191, "relat": 27192, "hae": 27193, "dreamers": 27194, "dese": 27195, "ðŁĮ»": 27196, "restricted": 27197, "nale": 27198, "rhp": 27199, "dolan": 27200, "munster": 27201, "haired": 27202, "consultants": 27203, "joints": 27204, "humil": 27205, "dill": 27206, "relentless": 27207, "té": 27208, "afil": 27209, "utilities": 27210, "japanese": 27211, "condemn": 27212, "petite": 27213, "collide": 27214, "qf": 27215, "peaches": 27216, "courier": 27217, "lore": 27218, "âĺİï¸ı": 27219, "reliability": 27220, "chuk": 27221, "ðŁĻĥ": 27222, "stures": 27223, "gether": 27224, "hostel": 27225, "bier": 27226, "-_-": 27227, "âĩ": 27228, "eze": 27229, "tailo": 27230, "dient": 27231, "bluff": 27232, "chuffed": 27233, "pilip": 27234, "monarch": 27235, "eem": 27236, "buchan": 27237, "bick": 27238, "opau": 27239, "kups": 27240, "ย": 27241, "pistons": 27242, "spins": 27243, "mand": 27244, "cest": 27245, "burne": 27246, "vile": 27247, "cherries": 27248, "beckett": 27249, "needles": 27250, "panch": 27251, "ëĤ": 27252, "hahah": 27253, "troubles": 27254, "insists": 27255, "doyou": 27256, "gmc": 27257, "mortar": 27258, "delegate": 27259, "inn": 27260, "ganda": 27261, "sinatra": 27262, "त": 27263, "speeding": 27264, "pupil": 27265, "premises": 27266, "alignment": 27267, "pikach": 27268, "asus": 27269, "jalan": 27270, "ص": 27271, "limestone": 27272, "folkl": 27273, "parmesan": 27274, "ceil": 27275, "moy": 27276, "shawnmendes": 27277, "acup": 27278, "hust": 27279, "otes": 27280, "medina": 27281, "madi": 27282, "gtav": 27283, "censorship": 27284, "arg": 27285, "sweeney": 27286, "sykes": 27287, "colo": 27288, "footsteps": 27289, "canned": 27290, "advance": 27291, "gtaonline": 27292, "healthyliving": 27293, "ðŁį¾": 27294, "aig": 27295, "pality": 27296, "ocs": 27297, "hebrew": 27298, "imminent": 27299, "berkshire": 27300, "jeremiah": 27301, "outgoing": 27302, "baker": 27303, "entrata": 27304, "maids": 27305, "groves": 27306, "boc": 27307, "adel": 27308, "mfw": 27309, "conscience": 27310, "armys": 27311, "nutella": 27312, "contestalert": 27313, "novelist": 27314, "lah": 27315, "banker": 27316, "marquez": 27317, "ðŁı¡": 27318, "toff": 27319, "outage": 27320, "grp": 27321, "ðŁĺŃðŁĺŃðŁĺŃðŁĺŃ": 27322, "muscle": 27323, "dudley": 27324, "nvidia": 27325, "midi": 27326, "muni": 27327, "essays": 
27328, "datac": 27329, "carter": 27330, "ร": 27331, "tans": 27332, "ives": 27333, "publications": 27334, "aler": 27335, "okwx": 27336, "ilu": 27337, "cutt": 27338, "harp": 27339, "outlaw": 27340, "lutheran": 27341, "brill": 27342, "bolic": 27343, "dowell": 27344, "greenland": 27345, "besties": 27346, "pathi": 27347, "payton": 27348, "guest": 27349, "harden": 27350, "ðŁ¤©": 27351, "anned": 27352, "evacuation": 27353, "poised": 27354, "mcder": 27355, "bhan": 27356, "oi": 27357, "envelope": 27358, "cid": 27359, "cavi": 27360, "tapas": 27361, "bookreview": 27362, "greyhound": 27363, "âĻª": 27364, "feud": 27365, "lungs": 27366, "forte": 27367, "raider": 27368, "ffer": 27369, "onix": 27370, "depend": 27371, "ynwa": 27372, "relating": 27373, "devs": 27374, "ðŁĴIJ": 27375, "acquires": 27376, "dha": 27377, "jyo": 27378, "privati": 27379, "canine": 27380, "kb": 27381, "crab": 27382, "sardin": 27383, "imagining": 27384, "kj": 27385, "empor": 27386, "downhill": 27387, "nez": 27388, "taeyeon": 27389, "nickimin": 27390, "gbp": 27391, "àµ": 27392, "wap": 27393, "secco": 27394, "mashed": 27395, "ðŁĴ¥ðŁĴ¥": 27396, "augustine": 27397, "dissol": 27398, "dictator": 27399, "âĵ": 27400, "viper": 27401, "edfringe": 27402, "vaux": 27403, "hardwork": 27404, "booklet": 27405, "nox": 27406, "chiff": 27407, "ðŁĴ¨": 27408, "observations": 27409, "xboxone": 27410, "usher": 27411, "keer": 27412, "lup": 27413, "dallas": 27414, "calgary": 27415, "madra": 27416, "dious": 27417, "kbs": 27418, "woodward": 27419, "heroine": 27420, "lumber": 27421, "seaworld": 27422, "ows": 27423, "mcke": 27424, "maverick": 27425, "gula": 27426, "crossroads": 27427, "fang": 27428, "sade": 27429, "nikol": 27430, "cheetah": 27431, "mec": 27432, "ppg": 27433, "erick": 27434, "ðŁİµ": 27435, "toxic": 27436, "bjj": 27437, "viola": 27438, "spire": 27439, "chino": 27440, "travis": 27441, "institutional": 27442, "haas": 27443, "lowry": 27444, "wac": 27445, "eae": 27446, "humid": 27447, "mpton": 27448, "ruck": 27449, "jew": 27450, "cine": 27451, "zimmer": 27452, "sef": 27453, "bharat": 27454, "frees": 27455, "aamir": 27456, "ðŁĴħ": 27457, "zinc": 27458, "wane": 27459, "multiplayer": 27460, "royalwedding": 27461, "eel": 27462, "precipit": 27463, "query": 27464, "kimberly": 27465, "isabel": 27466, "fulfill": 27467, "igan": 27468, "vaul": 27469, "pane": 27470, "scy": 27471, "digit": 27472, "gunn": 27473, "utah": 27474, "dogday": 27475, "fion": 27476, "xiaomi": 27477, "dac": 27478, "elast": 27479, "chavez": 27480, "roblo": 27481, "gine": 27482, "tenth": 27483, "abh": 27484, "keto": 27485, "hurdle": 27486, "nadia": 27487, "memorabilia": 27488, "habs": 27489, "quan": 27490, "hw": 27491, "hvac": 27492, "pixar": 27493, "eccle": 27494, "kramer": 27495, "accuses": 27496, "ðŁĴļðŁĴļ": 27497, "perse": 27498, "meantime": 27499, "wahl": 27500, "atletico": 27501, "âĢ¢âĢ¢âĢ¢âĢ¢": 27502, "ottoman": 27503, "novo": 27504, "kus": 27505, "connected": 27506, "trusts": 27507, "dmv": 27508, "spencer": 27509, "rahulg": 27510, "dove": 27511, "stokes": 27512, "bologna": 27513, "enthusiasts": 27514, "ê": 27515, "rockstargames": 27516, "tedcruz": 27517, "duras": 27518, "sacked": 27519, "latex": 27520, "immersive": 27521, "cert": 27522, "lucin": 27523, "principals": 27524, "fares": 27525, "sails": 27526, "farn": 27527, "ament": 27528, "saffron": 27529, "quentin": 27530, "checkpoint": 27531, "ferris": 27532, "excur": 27533, "ðŁijīðŁı¼": 27534, "bailey": 27535, "seh": 27536, "terre": 27537, "madam": 27538, "sband": 27539, "wanderers": 27540, "cumberbatch": 27541, "yyc": 27542, 
"digitally": 27543, "blackandwhitephotography": 27544, "rollin": 27545, "moroccan": 27546, "ðŁĮħ": 27547, "dinner": 27548, "dwell": 27549, "toom": 27550, "mye": 27551, "ezra": 27552, "cpfc": 27553, "warhol": 27554, "meer": 27555, "jonah": 27556, "noaa": 27557, "sgate": 27558, "soon": 27559, "secular": 27560, "gating": 27561, "tio": 27562, "driver": 27563, "sissy": 27564, "assange": 27565, "tath": 27566, "edmund": 27567, "bobcats": 27568, "raji": 27569, "postage": 27570, "studs": 27571, "mgm": 27572, "kato": 27573, "edinburgh": 27574, "meetthe": 27575, "shirt": 27576, "faa": 27577, "mensfashion": 27578, "spreads": 27579, "wim": 27580, "carts": 27581, "phoebe": 27582, "jars": 27583, "botswana": 27584, "ÙĤ": 27585, "edwar": 27586, "skar": 27587, "rive": 27588, "gusty": 27589, "ctv": 27590, "ferdinand": 27591, "sutherland": 27592, "nickiminaj": 27593, "kv": 27594, "sius": 27595, "beech": 27596, "rez": 27597, "desires": 27598, "onial": 27599, "campo": 27600, "quarry": 27601, "lorraine": 27602, "gilmore": 27603, "iggy": 27604, "µï¸ı": 27605, "hopping": 27606, "aviz": 27607, "ðŁĮº": 27608, "unisex": 27609, "dedicate": 27610, "attitudes": 27611, "steer": 27612, "junkie": 27613, "railway": 27614, "yb": 27615, "whisper": 27616, "keyan": 27617, "kus": 27618, "jug": 27619, "dix": 27620, "ains": 27621, "summon": 27622, "ovich": 27623, "syed": 27624, "herald": 27625, "maison": 27626, "meded": 27627, "wildflower": 27628, "mainland": 27629, "risky": 27630, "rukh": 27631, "overlooked": 27632, "kic": 27633, "destroys": 27634, "naman": 27635, "kip": 27636, "zano": 27637, "championsleague": 27638, "bandit": 27639, "quincy": 27640, "smile": 27641, "calvin": 27642, "openings": 27643, "tapp": 27644, "olulu": 27645, "spectro": 27646, "accredited": 27647, "apk": 27648, "praised": 27649, "barnett": 27650, "pollen": 27651, "premiered": 27652, "selenagomez": 27653, "toured": 27654, "screenings": 27655, "uuu": 27656, "miso": 27657, "ense": 27658, "adamlambert": 27659, "guelph": 27660, "haryana": 27661, "hutto": 27662, "lear": 27663, "ltc": 27664, "poached": 27665, "brexit": 27666, "æĿ": 27667, "ttc": 27668, "pavement": 27669, "mongers": 27670, "roe": 27671, "aders": 27672, "lington": 27673, "participant": 27674, "cared": 27675, "gail": 27676, "yates": 27677, "lantic": 27678, "dashboard": 27679, "joo": 27680, "felipe": 27681, "ssionist": 27682, "bum": 27683, "send": 27684, "aeri": 27685, "thugs": 27686, "lucifer": 27687, "ahe": 27688, "detector": 27689, "filly": 27690, "gasoline": 27691, "hamper": 27692, "humpday": 27693, "theta": 27694, "theband": 27695, "forecasts": 27696, "ohhh": 27697, "lobb": 27698, "holl": 27699, "cpu": 27700, "azu": 27701, "adar": 27702, "hailey": 27703, "bub": 27704, "cart": 27705, "quoted": 27706, "anarchy": 27707, "pancre": 27708, "twitart": 27709, "alden": 27710, "stash": 27711, "theless": 27712, "orni": 27713, "beliebers": 27714, "mormon": 27715, "particle": 27716, "aviation": 27717, "â¬Ĩ": 27718, "webcamtoy": 27719, "saddened": 27720, "cruis": 27721, "hamlet": 27722, "nct": 27723, "rollins": 27724, "marquee": 27725, "sawyer": 27726, "reliance": 27727, "aura": 27728, "diec": 27729, "soothing": 27730, "signings": 27731, "akis": 27732, "ó": 27733, "atkins": 27734, "aerop": 27735, "ðŁĮ¿": 27736, "yab": 27737, "shari": 27738, "connol": 27739, "dubbed": 27740, "manufacture": 27741, "convincing": 27742, "feelthebern": 27743, "rau": 27744, "pulit": 27745, "onec": 27746, "gemstone": 27747, "urging": 27748, "bagu": 27749, "gah": 27750, "acids": 27751, "fianc": 27752, "zodiac": 27753, "snoop": 27754, 
"herrera": 27755, "initiated": 27756, "venge": 27757, "professors": 27758, "prodi": 27759, "stronger": 27760, "emission": 27761, "bba": 27762, "halle": 27763, "tapp": 27764, "hawan": 27765, "whim": 27766, "competed": 27767, "myrtle": 27768, "irport": 27769, "coldplay": 27770, "ache": 27771, "skep": 27772, "mson": 27773, "ssic": 27774, "calligraphy": 27775, "swimmers": 27776, "mey": 27777, "ppc": 27778, "thrift": 27779, "poc": 27780, "replaces": 27781, "commuter": 27782, "âģ¦âģ¦@": 27783, "goers": 27784, "logue": 27785, "paradig": 27786, "baskets": 27787, "sensitivity": 27788, "johan": 27789, "atlantis": 27790, "&&": 27791, "suitcase": 27792, "anxious": 27793, "lh": 27794, "stri": 27795, "galloway": 27796, "stread": 27797, "warden": 27798, "grounded": 27799, "fficiency": 27800, "lifeat": 27801, "relic": 27802, "disguise": 27803, "islanders": 27804, "fcofficial": 27805, "classicalmusic": 27806, "bmc": 27807, "enfield": 27808, "bique": 27809, "oakley": 27810, "batman": 27811, "slaying": 27812, "nerves": 27813, "multit": 27814, "calcium": 27815, "projector": 27816, "scottsdale": 27817, "antino": 27818, "grips": 27819, "kimmel": 27820, "desmond": 27821, "protestors": 27822, "hiatus": 27823, "metabolism": 27824, "concluded": 27825, "presser": 27826, "tipping": 27827, "slide": 27828, "eto": 27829, "hunting": 27830, "ausopen": 27831, "rik": 27832, "ppery": 27833, "innovators": 27834, "pitchers": 27835, "agger": 27836, "fungi": 27837, "zad": 27838, "prolific": 27839, "rocknroll": 27840, "blames": 27841, "ctar": 27842, "stamford": 27843, "qad": 27844, "mozzarella": 27845, "insanely": 27846, "denver": 27847, "phouse": 27848, "nomad": 27849, "ï¿": 27850, "sris": 27851, "produ": 27852, "henley": 27853, "pagan": 27854, "amtrak": 27855, "rubi": 27856, "incl": 27857, "tutor": 27858, "scotia": 27859, "woes": 27860, "singapo": 27861, "funnel": 27862, "turnbull": 27863, "knowledge": 27864, "grimm": 27865, "realmadrid": 27866, "weare": 27867, "missiles": 27868, "consol": 27869, "emojis": 27870, "sneak": 27871, "smiths": 27872, "ruiz": 27873, "brou": 27874, "iel": 27875, "haver": 27876, "ðŁĮļ": 27877, "kingof": 27878, "basilica": 27879, "circulation": 27880, "printers": 27881, "tapping": 27882, "ridley": 27883, "dragged": 27884, "haj": 27885, "writer": 27886, "fundamentals": 27887, "personalities": 27888, "metre": 27889, "stereotypes": 27890, "burle": 27891, "bestof": 27892, "nffc": 27893, "hath": 27894, "ministries": 27895, "aali": 27896, "tracing": 27897, "paved": 27898, "łï¸ı": 27899, "gic": 27900, "inspire": 27901, "tug": 27902, "hare": 27903, "repeated": 27904, "expon": 27905, "lolli": 27906, "rhode": 27907, "precin": 27908, "installations": 27909, "instagram": 27910, "azar": 27911, "ies": 27912, "solely": 27913, "dukes": 27914, "missionary": 27915, "vanguard": 27916, "fursuitfriday": 27917, "ond": 27918, "polari": 27919, "mast": 27920, "haran": 27921, "josé": 27922, "jacked": 27923, "ecoun": 27924, "alities": 27925, "neph": 27926, "ravel": 27927, "moderated": 27928, "scow": 27929, "sfb": 27930, "uruguay": 27931, "aso": 27932, "nig": 27933, "audu": 27934, "pints": 27935, "latina": 27936, "benz": 27937, "mitting": 27938, "charted": 27939, "matology": 27940, "citro": 27941, "biopic": 27942, "ðŁijŃ": 27943, "djokovic": 27944, "foxy": 27945, "aguil": 27946, "soto": 27947, "anada": 27948, "sinking": 27949, "scrap": 27950, "hairs": 27951, "bethany": 27952, "factfriday": 27953, "ðŁIJIJ": 27954, "unleashed": 27955, ")(": 27956, "contradic": 27957, "ramon": 27958, "coastline": 27959, "yong": 27960, "snsd": 27961, 
"ligan": 27962, "pome": 27963, "mitage": 27964, "gett": 27965, "wati": 27966, "risk": 27967, "soaring": 27968, "brush": 27969, "fpl": 27970, "avan": 27971, "åĨ": 27972, "larson": 27973, "shear": 27974, "multil": 27975, "blur": 27976, "multimedia": 27977, "chunky": 27978, "pari": 27979, "nani": 27980, "weird": 27981, "cholesterol": 27982, "charles": 27983, "dreamed": 27984, "tanning": 27985, "puzzles": 27986, "fram": 27987, "handball": 27988, "chag": 27989, "belize": 27990, "alu": 27991, "bangs": 27992, "ÑĦ": 27993, "detectives": 27994, "mcg": 27995, "ishq": 27996, "bothered": 27997, "safc": 27998, "mping": 27999, "teneri": 28000, "gays": 28001, "sailor": 28002, "angi": 28003, "multicul": 28004, "guessed": 28005, "rosé": 28006, "highways": 28007, "broom": 28008, "chattanoo": 28009, "-'": 28010, "seeker": 28011, "oned": 28012, "atf": 28013, "luc": 28014, "><": 28015, "bari": 28016, "percep": 28017, "jewelry": 28018, "asph": 28019, "sorrow": 28020, "sling": 28021, "mammoth": 28022, "jackie": 28023, "ë§": 28024, "wiltshire": 28025, "sao": 28026, "cancell": 28027, "impaired": 28028, "torial": 28029, "breed": 28030, "guyen": 28031, "judice": 28032, "title": 28033, "prospective": 28034, "applicants": 28035, "ðŁįĬ": 28036, "episcop": 28037, "eid": 28038, "byo": 28039, "stockings": 28040, "ðŁĴĥðŁĴĥ": 28041, "llp": 28042, "snag": 28043, "keepit": 28044, "lough": 28045, "olson": 28046, "maturity": 28047, "!!!\"": 28048, "copter": 28049, "isha": 28050, "bli": 28051, "wilmington": 28052, "tryouts": 28053, "thai": 28054, "ðŁ¥³": 28055, "pebble": 28056, "kraft": 28057, "fp": 28058, "º": 28059, "ssively": 28060, "livin": 28061, "contestants": 28062, "textures": 28063, "joan": 28064, "hdr": 28065, "filmfestival": 28066, "provence": 28067, "wido": 28068, "opend": 28069, "csi": 28070, "stown": 28071, "croati": 28072, "adjust": 28073, "hostile": 28074, "analysts": 28075, "ilan": 28076, "cuppa": 28077, "brum": 28078, "newfoundland": 28079, "goodwin": 28080, "mett": 28081, "mallorca": 28082, "plugs": 28083, "buk": 28084, "bbhutto": 28085, "wrestle": 28086, "saire": 28087, "shopped": 28088, "forza": 28089, "lehead": 28090, "vivo": 28091, "bast": 28092, "roxy": 28093, "regis": 28094, "hardworking": 28095, "honolulu": 28096, "despair": 28097, "youngsters": 28098, "nig": 28099, "impromp": 28100, "rolltide": 28101, "deemed": 28102, "treason": 28103, "rushed": 28104, "forged": 28105, "fff": 28106, "pikachu": 28107, "briggs": 28108, "doit": 28109, "accent": 28110, "laus": 28111, "glaze": 28112, "competent": 28113, "aho": 28114, "photog": 28115, "midfield": 28116, "lego": 28117, "harvard": 28118, "minorities": 28119, "reilly": 28120, "sliced": 28121, "onceupon": 28122, "initially": 28123, "financially": 28124, "landscapephotography": 28125, "hardro": 28126, "quo": 28127, "mmers": 28128, "parkinson": 28129, "smugg": 28130, "readiness": 28131, "brutally": 28132, "gloucester": 28133, "mped": 28134, "bbhuttozardari": 28135, "murder": 28136, "yed": 28137, "dataviz": 28138, "srt": 28139, "downing": 28140, "bians": 28141, "mü": 28142, "fleck": 28143, "flipped": 28144, "sly": 28145, "brilliance": 28146, "rim": 28147, "kum": 28148, "bubba": 28149, "koi": 28150, "knitted": 28151, "sorg": 28152, "mais": 28153, "ðŁĮ²": 28154, "tiss": 28155, "sustain": 28156, "sensu": 28157, "akhan": 28158, "ziest": 28159, "examines": 28160, "chardonnay": 28161, "username": 28162, "shortlist": 28163, "rebs": 28164, "ono": 28165, "daring": 28166, "hardwood": 28167, "cheque": 28168, "righteous": 28169, "lightening": 28170, "dirk": 28171, "shradd": 
28172, "dura": 28173, "downstairs": 28174, "shal": 28175, "amigos": 28176, "ruff": 28177, "slaw": 28178, "ries": 28179, "rednation": 28180, "manus": 28181, "ðŁĩ§ðŁĩ·": 28182, "distinction": 28183, "ubun": 28184, "duran": 28185, "migra": 28186, "thians": 28187, "laver": 28188, "domestic": 28189, "kx": 28190, "jazzy": 28191, "justify": 28192, "belonging": 28193, "insulation": 28194, "colorstv": 28195, "drunken": 28196, "channeling": 28197, "quand": 28198, "xiii": 28199, "enlighten": 28200, "kano": 28201, "fatima": 28202, "teenchoice": 28203, "terrified": 28204, "pba": 28205, "asley": 28206, "metmuseum": 28207, "dune": 28208, "packer": 28209, "kio": 28210, "ðŁĴľðŁĴľ": 28211, "boiler": 28212, "fascism": 28213, "armored": 28214, "backgrounds": 28215, "inmates": 28216, "embarrassed": 28217, "defines": 28218, "thd": 28219, "wego": 28220, "silicone": 28221, "loon": 28222, "elding": 28223, "borrowed": 28224, "hemp": 28225, "aksh": 28226, "kawasaki": 28227, "bry": 28228, "deaf": 28229, "killer": 28230, "disposal": 28231, "ðŁĩ°": 28232, "glastonbury": 28233, "uncovered": 28234, "oxide": 28235, "poff": 28236, "dant": 28237, "kj": 28238, "kuro": 28239, "drizzle": 28240, "peoples": 28241, "fee": 28242, "propri": 28243, "ddlovato": 28244, "piggy": 28245, "otis": 28246, "allergies": 28247, "ubis": 28248, "penguin": 28249, "sera": 28250, "viz": 28251, "prosperous": 28252, "icides": 28253, "tornadoes": 28254, "senegal": 28255, "webcast": 28256, "stored": 28257, "enchanted": 28258, "bbcone": 28259, "bayarea": 28260, "entrepreneurial": 28261, "rednationrising": 28262, "experimenting": 28263, "angan": 28264, "lotto": 28265, "theyre": 28266, "pore": 28267, "erp": 28268, "serene": 28269, "eastwood": 28270, "brokers": 28271, "barge": 28272, "stallion": 28273, "timberlake": 28274, "tailored": 28275, "dystop": 28276, "bate": 28277, "lators": 28278, "dixit": 28279, "branson": 28280, "dynamo": 28281, "kylie": 28282, "shameful": 28283, "btwn": 28284, "springtime": 28285, "mixture": 28286, "sounded": 28287, "luton": 28288, "dades": 28289, "mala": 28290, "opra": 28291, "enic": 28292, "rahulgandhi": 28293, "sewer": 28294, "~~~~": 28295, "kyu": 28296, "northeastern": 28297, "caer": 28298, "bcu": 28299, "nirvana": 28300, "kitchens": 28301, "ousy": 28302, "alm": 28303, "riverdale": 28304, "hidden": 28305, "flint": 28306, "spd": 28307, "patrons": 28308, "katyperry": 28309, "augh": 28310, "exhibitions": 28311, "smc": 28312, "shuts": 28313, "atore": 28314, "dain": 28315, "something": 28316, "berth": 28317, "bog": 28318, "porter": 28319, "gento": 28320, "concussion": 28321, "anglic": 28322, "rowe": 28323, "grilling": 28324, "scarlett": 28325, "mastering": 28326, "mornin": 28327, "commented": 28328, "sime": 28329, "sizing": 28330, "christy": 28331, "ceos": 28332, "stm": 28333, "atry": 28334, "tariffs": 28335, "vacation": 28336, "prejudice": 28337, "psu": 28338, "parental": 28339, "farage": 28340, "cana": 28341, "capcom": 28342, "kosovo": 28343, "youre": 28344, "menstru": 28345, "stalin": 28346, "grapefruit": 28347, "bran": 28348, "chesa": 28349, "daven": 28350, "excel": 28351, "!!)": 28352, "à¹Į": 28353, "distributor": 28354, "cea": 28355, "bridesma": 28356, "millennial": 28357, "wain": 28358, "observing": 28359, "misery": 28360, "planetary": 28361, "exposing": 28362, "braised": 28363, "compton": 28364, "dongha": 28365, "ql": 28366, "springsteen": 28367, "thul": 28368, "sylve": 28369, "cabo": 28370, "palad": 28371, "nielsen": 28372, "gazing": 28373, "baja": 28374, "roud": 28375, "orchids": 28376, "johannesburg": 28377, "seman": 
28378, "dji": 28379, "operative": 28380, "affection": 28381, "eclectic": 28382, "atc": 28383, "mutant": 28384, "awx": 28385, "nice": 28386, "melbourne": 28387, "indulg": 28388, "tulip": 28389, "diaspora": 28390, "welp": 28391, "biggie": 28392, "mississauga": 28393, "retriever": 28394, "oran": 28395, "tammy": 28396, "cta": 28397, "hippo": 28398, "seasoned": 28399, "germans": 28400, "engv": 28401, "marvellous": 28402, "imf": 28403, "relays": 28404, "montan": 28405, "mauriti": 28406, "meister": 28407, "assurance": 28408, "reigning": 28409, "sufficient": 28410, "hane": 28411, "nothing": 28412, "posse": 28413, "navy": 28414, "inlove": 28415, "brighton": 28416, "enqu": 28417, "chung": 28418, "sweaty": 28419, "esc": 28420, "caled": 28421, "mans": 28422, "nicaragua": 28423, "slices": 28424, "mocha": 28425, "washingtonpost": 28426, "bbn": 28427, "damned": 28428, "growing": 28429, "enburg": 28430, "loan": 28431, "mes": 28432, "whoops": 28433, "believers": 28434, "spiel": 28435, "vodaf": 28436, "lat": 28437, "sled": 28438, "cricketer": 28439, "browne": 28440, "golfers": 28441, "barra": 28442, "watchers": 28443, "luigi": 28444, "swamy": 28445, "moms": 28446, "pitched": 28447, "santor": 28448, "crs": 28449, "sire": 28450, "scamp": 28451, "bode": 28452, "stewar": 28453, "jonny": 28454, "entity": 28455, "pacqui": 28456, "mindful": 28457, "minindia": 28458, "bearded": 28459, "tempt": 28460, "scorpion": 28461, "eaton": 28462, "authorized": 28463, "arto": 28464, "svp": 28465, "opathy": 28466, "cchini": 28467, "housemusic": 28468, "disneyworld": 28469, "âĢĶ@": 28470, "propose": 28471, "diy": 28472, "expense": 28473, "teng": 28474, "puppets": 28475, "smel": 28476, "daca": 28477, "perry": 28478, "finn": 28479, "boosting": 28480, "leftovers": 28481, "cougs": 28482, "satellites": 28483, "many": 28484, "aze": 28485, "gong": 28486, "fie": 28487, "methodo": 28488, "ferries": 28489, "ðŁ¤ĶðŁ¤Ķ": 28490, "explorers": 28491, "loader": 28492, "attracted": 28493, "ilton": 28494, "goddamn": 28495, "piazza": 28496, "doctr": 28497, "saving": 28498, "paragraph": 28499, "visualization": 28500, "mayors": 28501, "workflow": 28502, "ackles": 28503, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 28504, "स": 28505, "twerk": 28506, "clut": 28507, "lover": 28508, "teases": 28509, "sian": 28510, "ote": 28511, "deterior": 28512, "accord": 28513, "lfw": 28514, "swarovski": 28515, "natal": 28516, "traps": 28517, "kina": 28518, "analyze": 28519, "layered": 28520, "beverages": 28521, "unit": 28522, "ransom": 28523, "peshaw": 28524, "destined": 28525, "astrology": 28526, "sipping": 28527, "mileycyrus": 28528, "camino": 28529, "marshmallow": 28530, "bliss": 28531, "outback": 28532, "faq": 28533, "intoler": 28534, "humility": 28535, "poppin": 28536, "halloween": 28537, "montene": 28538, "ophy": 28539, "nun": 28540, "tattooed": 28541, "aas": 28542, "ðŁĮ³": 28543, "daley": 28544, "quality": 28545, "dusa": 28546, "fishermen": 28547, "swif": 28548, "terrac": 28549, "stau": 28550, "lein": 28551, "trolling": 28552, "shipment": 28553, "gardener": 28554, "marchmadness": 28555, "headband": 28556, "grt": 28557, "burnett": 28558, "wand": 28559, "!!!!!!!!!": 28560, "ghe": 28561, "dux": 28562, "hud": 28563, "warner": 28564, "ðŁĩ¦": 28565, "exile": 28566, "rescue": 28567, "rata": 28568, "dhan": 28569, "ducati": 28570, "drown": 28571, "blends": 28572, "spie": 28573, "alligator": 28574, "simultaneously": 28575, "brooke": 28576, "uke": 28577, "khar": 28578, "communion": 28579, "rika": 28580, "fordfc": 28581, "chinatown": 28582, "yourown": 28583, "mey": 28584, "canal": 
28585, "systematic": 28586, "depri": 28587, "oxford": 28588, "anil": 28589, "wut": 28590, "equation": 28591, "bez": 28592, "fleur": 28593, "thegood": 28594, "langley": 28595, "adity": 28596, "edith": 28597, "alfie": 28598, "оÑĤ": 28599, "encry": 28600, "brill": 28601, "exemp": 28602, "cesar": 28603, "mbling": 28604, "abri": 28605, "scicom": 28606, "jing": 28607, "schooling": 28608, "mika": 28609, "mechanisms": 28610, "impromptu": 28611, "rhea": 28612, "moore": 28613, "crimea": 28614, "besto": 28615, "wright": 28616, "elders": 28617, "rods": 28618, "kamal": 28619, "folklore": 28620, "beet": 28621, "minion": 28622, "relieve": 28623, "thro": 28624, "teamusa": 28625, "pascal": 28626, "madewith": 28627, "bolivia": 28628, "itti": 28629, "freebies": 28630, "desired": 28631, "bestselling": 28632, "liness": 28633, "laden": 28634, "keane": 28635, "mists": 28636, "hippie": 28637, "attachment": 28638, "@/": 28639, "sew": 28640, "flanagan": 28641, "âĿĹï¸ı": 28642, "supremac": 28643, "stlcards": 28644, "sias": 28645, "qu": 28646, "rhys": 28647, "steep": 28648, "valleys": 28649, "vw": 28650, "paving": 28651, "dispat": 28652, "alison": 28653, "porte": 28654, "idu": 28655, "newsc": 28656, "socket": 28657, "mos": 28658, "costar": 28659, "revo": 28660, "proteins": 28661, "stanleycup": 28662, "mcal": 28663, "earring": 28664, "secs": 28665, "mclean": 28666, "capric": 28667, "nickelo": 28668, "aden": 28669, "vc": 28670, "shouse": 28671, "adaptive": 28672, "maximize": 28673, "entertainer": 28674, "prose": 28675, "griffi": 28676, "sixteen": 28677, "lamar": 28678, "mirage": 28679, "saudiarabia": 28680, "aweather": 28681, "rust": 28682, "infiltr": 28683, "fashionweek": 28684, "ðŁĺĬðŁĺĬðŁĺĬ": 28685, "selective": 28686, "bubble": 28687, "aden": 28688, "fennel": 28689, "decisive": 28690, "mta": 28691, "mocking": 28692, "mbles": 28693, "stamp": 28694, "mule": 28695, "bernardo": 28696, "grin": 28697, "pott": 28698, "jingle": 28699, "vettel": 28700, "colombian": 28701, "camo": 28702, "motivationmonday": 28703, "bahan": 28704, "ply": 28705, "dhary": 28706, "kami": 28707, "xmen": 28708, "sleeper": 28709, "gara": 28710, "mysti": 28711, "confidential": 28712, "conflicts": 28713, "pneu": 28714, "ces": 28715, "insurtech": 28716, "cleanse": 28717, "merely": 28718, "vais": 28719, "tux": 28720, "thegreat": 28721, "sharon": 28722, "maj": 28723, "hola": 28724, "ecosystems": 28725, "ajay": 28726, "aaj": 28727, "hush": 28728, "harmon": 28729, "backtoschool": 28730, "wikileaks": 28731, "reflected": 28732, "ðŁĺĵ": 28733, "commemorating": 28734, "acet": 28735, "buckingham": 28736, "messiah": 28737, "tuous": 28738, "hornet": 28739, "tobe": 28740, "dq": 28741, "heine": 28742, "mig": 28743, "plate": 28744, "nicholson": 28745, "spie": 28746, "cumberland": 28747, "normal": 28748, "phobia": 28749, "happyhalloween": 28750, "cityfc": 28751, "mcel": 28752, "gillian": 28753, "keto": 28754, "lude": 28755, "demise": 28756, "suga": 28757, "strate": 28758, "mcgrath": 28759, "visitscotland": 28760, "fooled": 28761, "cbr": 28762, "gcse": 28763, "colori": 28764, "potd": 28765, "missuniverse": 28766, "finances": 28767, "mapoli": 28768, "forks": 28769, "Ø´": 28770, "cannon": 28771, "medicinal": 28772, "ðŁĹĵ": 28773, "kho": 28774, "wreck": 28775, "panto": 28776, "bagel": 28777, "gull": 28778, "syndicate": 28779, "icy": 28780, "prc": 28781, "kien": 28782, "zika": 28783, "tish": 28784, "peta": 28785, "cco": 28786, "liza": 28787, "chut": 28788, "extraction": 28789, "elg": 28790, "gli": 28791, "fueled": 28792, "posit": 28793, "respectively": 28794, 
"leicester": 28795, "brink": 28796, "vulnerability": 28797, "imported": 28798, "esha": 28799, "ðŁ¦ħ": 28800, "rural": 28801, "rell": 28802, "gaming": 28803, "atlantic": 28804, "abandon": 28805, "noah": 28806, "resolved": 28807, "prostate": 28808, "allergic": 28809, "psd": 28810, "âĺ¹": 28811, "dungeon": 28812, "fangirl": 28813, "illuminated": 28814, "mhs": 28815, "whitesox": 28816, "dently": 28817, "cko": 28818, "endorse": 28819, "overly": 28820, "dazzling": 28821, "prioriti": 28822, "nightlife": 28823, "util": 28824, "behave": 28825, "flamen": 28826, "eastbound": 28827, "ðŁĴŁ": 28828, "iloveyou": 28829, "govuk": 28830, "mozambique": 28831, "allegi": 28832, "dri": 28833, "testimonial": 28834, "aths": 28835, "ì§Ģ": 28836, "mmy": 28837, "shabby": 28838, "prosecco": 28839, "friendships": 28840, "calam": 28841, "damages": 28842, "offset": 28843, "jurassic": 28844, "juno": 28845, "arrell": 28846, "ðŁĴ©": 28847, "interventions": 28848, "daredevil": 28849, "carver": 28850, "runaway": 28851, "rane": 28852, "trustees": 28853, "haute": 28854, "depths": 28855, "ðŁİŃ": 28856, "mein": 28857, "sacrifices": 28858, "concier": 28859, "nesting": 28860, "izzy": 28861, "metam": 28862, "ilovemy": 28863, "urine": 28864, "dulu": 28865, "malhotra": 28866, "veins": 28867, "nightly": 28868, "coat": 28869, "andi": 28870, "hewitt": 28871, "lonel": 28872, "cible": 28873, "write": 28874, "jennie": 28875, "santac": 28876, "ĸï¸ı": 28877, "strato": 28878, "singapore": 28879, "soprano": 28880, "kristen": 28881, "cheerful": 28882, "fleetwood": 28883, "fairi": 28884, "meli": 28885, "wast": 28886, "turnt": 28887, "sforsale": 28888, "scrolling": 28889, "angelina": 28890, "rendition": 28891, "jericho": 28892, "nicky": 28893, "orb": 28894, "flavo": 28895, "patriot": 28896, "asheville": 28897, "sickness": 28898, "refund": 28899, "aggression": 28900, "bpl": 28901, "ãĥĥ": 28902, "elusive": 28903, "thistory": 28904, "hanger": 28905, "buffs": 28906, "villas": 28907, "atkinson": 28908, "sph": 28909, "jait": 28910, "declined": 28911, "wok": 28912, "supremacy": 28913, "ootball": 28914, "eyang": 28915, "ðŁİĵ": 28916, "sford": 28917, "athi": 28918, "consume": 28919, "roadster": 28920, "eso": 28921, "upro": 28922, "recipe": 28923, "auf": 28924, "uci": 28925, "aron": 28926, "oooh": 28927, "csgo": 28928, "reich": 28929, "mcd": 28930, "minute": 28931, "ladies": 28932, "punk": 28933, "rutgers": 28934, "meek": 28935, "arizon": 28936, "taj": 28937, "landlord": 28938, "degra": 28939, "autumn": 28940, "lynx": 28941, "usf": 28942, "bhi": 28943, "fairytale": 28944, "donghae": 28945, "betsy": 28946, "exploded": 28947, "chennai": 28948, "opa": 28949, "protag": 28950, "brant": 28951, "ðŁĵ°:": 28952, "gf": 28953, "palli": 28954, "ðŁı¼âĢįâĻĢï¸ı": 28955, "sut": 28956, "illini": 28957, "columnist": 28958, "shirtless": 28959, "decentr": 28960, "searched": 28961, "ecor": 28962, "buggy": 28963, "sack": 28964, "ðŁĺĤðŁĺŃ": 28965, "det": 28966, "theri": 28967, "ornaments": 28968, "bringback": 28969, "tov": 28970, "quarterfinals": 28971, "iche": 28972, "constra": 28973, "gier": 28974, "buchanan": 28975, "vix": 28976, "kayaking": 28977, "mustread": 28978, "swallow": 28979, "melb": 28980, "scaf": 28981, "opal": 28982, "mayoral": 28983, "harat": 28984, "ðŁ¦ĭ": 28985, "schedules": 28986, "idf": 28987, "hague": 28988, "roz": 28989, "aah": 28990, "dmc": 28991, "duplic": 28992, "cache": 28993, "orphan": 28994, "fracture": 28995, "recon": 28996, "chav": 28997, "bunnies": 28998, "alain": 28999, "mustafa": 29000, "ðŁİĻ": 29001, "vacations": 29002, "dynamite": 29003, 
"texted": 29004, "broadcaster": 29005, "ðŁĴ£": 29006, "steamed": 29007, "rocker": 29008, "dietary": 29009, "luxurytravel": 29010, "inaugurated": 29011, "sawards": 29012, "vaughn": 29013, "lincolnshire": 29014, "clicked": 29015, "kraja": 29016, "fanc": 29017, "removes": 29018, "layoffs": 29019, "mcfar": 29020, "breeds": 29021, "winnie": 29022, "jonghyun": 29023, "incentive": 29024, "variations": 29025, "patton": 29026, "aturday": 29027, "persistent": 29028, "prun": 29029, "piers": 29030, "dales": 29031, "æĸ": 29032, "breastfeeding": 29033, "rance": 29034, "tawa": 29035, "Ĥâĸ": 29036, "murdoch": 29037, "captive": 29038, "thistle": 29039, "nica": 29040, "commodity": 29041, "couldnt": 29042, "boardwalk": 29043, "gracious": 29044, "practitioners": 29045, "ngc": 29046, "scrum": 29047, "nero": 29048, "camouflage": 29049, "colon": 29050, "hei": 29051, "physicist": 29052, "saturdaymorning": 29053, "tener": 29054, "siwon": 29055, "columns": 29056, "brune": 29057, "yvr": 29058, "bair": 29059, "retires": 29060, "halam": 29061, "caber": 29062, "shazam": 29063, "minu": 29064, "cascade": 29065, "milkshake": 29066, "grid": 29067, "dren": 29068, "vincent": 29069, "sodium": 29070, "platter": 29071, "cheerleader": 29072, "chenko": 29073, "yak": 29074, "eliminated": 29075, "typo": 29076, "yman": 29077, "rethink": 29078, "âĿĹ": 29079, "tsville": 29080, "bernardokath": 29081, "extr": 29082, "ðŁĺģðŁĺģðŁĺģ": 29083, "tao": 29084, "reper": 29085, "moths": 29086, "empowered": 29087, "citing": 29088, "transported": 29089, "monks": 29090, "sanat": 29091, "clears": 29092, "bachelorette": 29093, "campbell": 29094, "rachael": 29095, "harle": 29096, "handler": 29097, "climbs": 29098, "interference": 29099, "release": 29100, "shand": 29101, "rbs": 29102, "hrh": 29103, "ãģª": 29104, "valle": 29105, "ré": 29106, "slime": 29107, "wakes": 29108, "chubby": 29109, "sloan": 29110, "elves": 29111, "athen": 29112, "attorneys": 29113, "microscope": 29114, "stoner": 29115, "scaling": 29116, "obe": 29117, "cout": 29118, "seman": 29119, "midweek": 29120, "balsam": 29121, "ðŁĺįâĿ¤": 29122, "tiful": 29123, "vish": 29124, "lotta": 29125, "ripping": 29126, "remn": 29127, "tire": 29128, "leap": 29129, "havent": 29130, "laby": 29131, "himach": 29132, "whispers": 29133, "wein": 29134, "ðŁİ¸": 29135, "wildflowers": 29136, "sele": 29137, "ucc": 29138, "liability": 29139, "azine": 29140, "swings": 29141, "kya": 29142, "tair": 29143, "remain": 29144, "edo": 29145, "flops": 29146, "pocket": 29147, "grandad": 29148, "examiner": 29149, "gris": 29150, "ffect": 29151, "ðŁijĬðŁı»": 29152, "studded": 29153, "heartbeat": 29154, "deacon": 29155, "firmly": 29156, "infectious": 29157, "stef": 29158, "outlines": 29159, "leasing": 29160, "claws": 29161, "sense": 29162, "tabs": 29163, "hoot": 29164, "mosul": 29165, "spawn": 29166, "coa": 29167, "hogwarts": 29168, "vein": 29169, "albania": 29170, "manuel": 29171, "bino": 29172, "vauxhall": 29173, "scotland": 29174, "gobucks": 29175, "matty": 29176, "physio": 29177, "torino": 29178, "constable": 29179, "investigated": 29180, "slower": 29181, "mistaken": 29182, "bayer": 29183, "wildfires": 29184, "voic": 29185, "xon": 29186, "timeto": 29187, "chassis": 29188, "barric": 29189, "pion": 29190, "baldhead": 29191, "wook": 29192, "registr": 29193, "drafts": 29194, "bhs": 29195, "ligue": 29196, "lick": 29197, "staffordshire": 29198, "bafta": 29199, "darry": 29200, "jeanne": 29201, "vending": 29202, "corp": 29203, "âĽ³ï¸ı": 29204, "kiddos": 29205, "fenway": 29206, "cao": 29207, "westbound": 29208, "ðŁĺĻ": 29209, "dvr": 
29210, "quicker": 29211, "blah": 29212, "goodie": 29213, "ðŁĴĭðŁĴĭ": 29214, "vox": 29215, "esper": 29216, "facade": 29217, "correlation": 29218, "redbull": 29219, "roup": 29220, "declining": 29221, "chive": 29222, "mcgee": 29223, "turo": 29224, "inder": 29225, "feller": 29226, "fug": 29227, "ilysm": 29228, "mardi": 29229, "peshawar": 29230, "kieran": 29231, "inema": 29232, "meatballs": 29233, "peck": 29234, "depressing": 29235, "sensing": 29236, "giz": 29237, "ddington": 29238, "springwatch": 29239, "roaming": 29240, "yellowstone": 29241, "horseshoe": 29242, "amman": 29243, "weekday": 29244, "olor": 29245, "ðŁ¥°": 29246, "boosts": 29247, "sprint": 29248, "scarves": 29249, "jee": 29250, "beetro": 29251, "clan": 29252, "allthe": 29253, "ìĦ¸ë": 29254, "enlightenment": 29255, "adobe": 29256, "regeneration": 29257, "?@": 29258, "contag": 29259, "yachts": 29260, "tou": 29261, "mora": 29262, "envoy": 29263, "rani": 29264, "goli": 29265, "dhanushkraja": 29266, "woodworking": 29267, "strengths": 29268, "sedi": 29269, "discs": 29270, "arina": 29271, "scon": 29272, "lite": 29273, "another": 29274, "ðŁ¥Ĭ": 29275, "yemen": 29276, "guern": 29277, "savvy": 29278, "loyed": 29279, "biomed": 29280, "heartbreak": 29281, "comrades": 29282, "millie": 29283, "patch": 29284, "unf": 29285, "jarvis": 29286, "blaming": 29287, "commemoration": 29288, "gey": 29289, "å¥": 29290, "cardiovascular": 29291, "aligned": 29292, "document": 29293, ".?": 29294, "aesthetics": 29295, "emu": 29296, "theirs": 29297, "leh": 29298, "psic": 29299, "sif": 29300, "plateau": 29301, "expend": 29302, "dominating": 29303, "robes": 29304, "mauritius": 29305, "exceptionally": 29306, "homer": 29307, "discoveries": 29308, "braun": 29309, "tennant": 29310, "insulin": 29311, "ðŁİ®": 29312, "carbs": 29313, "teas": 29314, "?!\"": 29315, "zie": 29316, "francois": 29317, "browsing": 29318, "thol": 29319, "clarence": 29320, "helper": 29321, "obtained": 29322, "cassie": 29323, "lees": 29324, "!,": 29325, "pomegran": 29326, "hubs": 29327, "prestige": 29328, "][": 29329, "macher": 29330, "bottled": 29331, "punch": 29332, "pipe": 29333, "och": 29334, "gallons": 29335, "deliveries": 29336, "ura": 29337, "unday": 29338, "monde": 29339, "depicts": 29340, "regency": 29341, "outrageous": 29342, "khaled": 29343, "caro": 29344, "hearti": 29345, "zag": 29346, "developmental": 29347, "overcoming": 29348, "statistical": 29349, "flavored": 29350, "fords": 29351, "creatives": 29352, "laurence": 29353, "dias": 29354, "sunscreen": 29355, "inked": 29356, "preacher": 29357, "nul": 29358, "impacting": 29359, "autistic": 29360, "âļĶï¸ı": 29361, "oss": 29362, "pelicans": 29363, "celeste": 29364, "vb": 29365, "rump": 29366, "mcgra": 29367, "fairfax": 29368, "humor": 29369, "bbcnews": 29370, "rowling": 29371, "calder": 29372, "seamless": 29373, "agne": 29374, "pti": 29375, "mixed": 29376, "tshirts": 29377, "merci": 29378, "btob": 29379, "womeninstem": 29380, "genealogy": 29381, "preven": 29382, "lour": 29383, "cradle": 29384, "giuse": 29385, "о": 29386, "chrono": 29387, "fairness": 29388, "chocolate": 29389, "tory": 29390, "asda": 29391, "prescott": 29392, "stretched": 29393, "alman": 29394, "uil": 29395, "recharge": 29396, "intre": 29397, "obst": 29398, "hospital": 29399, "hayward": 29400, "tenerife": 29401, "friedman": 29402, "vaping": 29403, "confessions": 29404, "yeah": 29405, "balli": 29406, "lucknow": 29407, "corpse": 29408, "sculptor": 29409, "ampton": 29410, "tpp": 29411, "indicates": 29412, "surplus": 29413, "truman": 29414, "ðĿĻ": 29415, "sinha": 29416, "invo": 
29417, "sovereign": 29418, "kev": 29419, "establishing": 29420, "engraved": 29421, "assuming": 29422, "ðŁıģ": 29423, "souza": 29424, "fabi": 29425, "toned": 29426, "ounge": 29427, "deloit": 29428, "downey": 29429, "noble": 29430, "omor": 29431, "cartridge": 29432, "ðŁıIJ": 29433, "uhur": 29434, "holloway": 29435, "successes": 29436, "rsa": 29437, "âĦ¢": 29438, "mazz": 29439, "twd": 29440, "discourse": 29441, ".<": 29442, "yat": 29443, "satisfy": 29444, "compri": 29445, "ह": 29446, "graphite": 29447, "dissertation": 29448, "arter": 29449, "íĶ": 29450, "bally": 29451, "zombi": 29452, "lyons": 29453, "aic": 29454, "ubc": 29455, "prada": 29456, "eil": 29457, "dax": 29458, "clai": 29459, "granddaughter": 29460, "extravaganza": 29461, "challenge": 29462, "ðŁ¤ŀ": 29463, "pover": 29464, "primarily": 29465, "daddy": 29466, "mana": 29467, "bikers": 29468, "inquiries": 29469, "daun": 29470, "feline": 29471, "generative": 29472, "hef": 29473, "benefiting": 29474, "lindsey": 29475, "polka": 29476, "demonstrated": 29477, "alle": 29478, "randy": 29479, "osu": 29480, "lowkey": 29481, "weirdest": 29482, "redbull": 29483, "oury": 29484, "nous": 29485, "woodstock": 29486, "credenti": 29487, "nicer": 29488, "gado": 29489, "alyss": 29490, "aph": 29491, "preparedness": 29492, "stationary": 29493, "incorporated": 29494, "dyer": 29495, "saratoga": 29496, "celesti": 29497, ":\"": 29498, "antibiotics": 29499, "orgs": 29500, "indefin": 29501, "apron": 29502, "иÐ": 29503, "fifteen": 29504, "nof": 29505, "ðŁĶĿ": 29506, "phx": 29507, "tega": 29508, "mz": 29509, "organizational": 29510, "onair": 29511, "bandung": 29512, "pleasures": 29513, "mori": 29514, "secretari": 29515, "raccoon": 29516, "cashi": 29517, "pilates": 29518, "kon": 29519, "geoffrey": 29520, "lao": 29521, "kamp": 29522, "departments": 29523, "backpacking": 29524, "anam": 29525, "ë": 29526, "crackdown": 29527, "aunty": 29528, "ondo": 29529, "lizzie": 29530, "phers": 29531, "cun": 29532, "ðŁĩ±": 29533, "kpop": 29534, "put": 29535, "intentional": 29536, "connolly": 29537, "barclays": 29538, "hsfb": 29539, "swindon": 29540, "uku": 29541, "sally": 29542, "aint": 29543, "âľħ": 29544, "penang": 29545, "uplifting": 29546, "epilepsy": 29547, "interro": 29548, "bungal": 29549, "goku": 29550, "blueberries": 29551, "द": 29552, "ussia": 29553, "silky": 29554, "moured": 29555, "istic": 29556, "briefs": 29557, "meats": 29558, "gob": 29559, "chaser": 29560, "statewide": 29561, "prasad": 29562, "glitch": 29563, "arin": 29564, "banff": 29565, "member": 29566, "ðŁĺŃâĿ¤ï¸ı": 29567, "loving": 29568, "halla": 29569, "ม": 29570, "smokers": 29571, "yaku": 29572, "scicomm": 29573, "physio": 29574, "swol": 29575, "lemons": 29576, "gelato": 29577, "chool": 29578, "capitals": 29579, "kistan": 29580, "tights": 29581, "spikes": 29582, "travellers": 29583, "iklan": 29584, "commissioning": 29585, "arine": 29586, "emabiggestfans": 29587, "emphasis": 29588, "frontline": 29589, "paddock": 29590, "destructive": 29591, "baha": 29592, "linger": 29593, "jewish": 29594, "shetland": 29595, "mcgin": 29596, "monkey": 29597, "koz": 29598, "sone": 29599, "rajini": 29600, "teh": 29601, "yen": 29602, "cvs": 29603, "masquer": 29604, "girly": 29605, "wesle": 29606, "wasnt": 29607, "brody": 29608, "terminator": 29609, "gille": 29610, "maggi": 29611, "birdie": 29612, "jeopardy": 29613, "cubic": 29614, "vmware": 29615, "intricate": 29616, "anup": 29617, "topia": 29618, "easton": 29619, "sabres": 29620, "investigates": 29621, "busting": 29622, "bilingual": 29623, "valentino": 29624, "informat": 29625, 
"ferre": 29626, "adventur": 29627, "hydrate": 29628, "forsy": 29629, "aziz": 29630, "santo": 29631, "ede": 29632, "whistler": 29633, "continuously": 29634, "dham": 29635, "unused": 29636, "jihad": 29637, "addictive": 29638, "vidy": 29639, "dob": 29640, "ido": 29641, "fied": 29642, "niversary": 29643, "none": 29644, "fuer": 29645, "ðŁĺįðŁĺĺ": 29646, "covenant": 29647, "printable": 29648, "immaculate": 29649, "oem": 29650, "clt": 29651, "servants": 29652, "consumed": 29653, "unreleased": 29654, "scum": 29655, "packaged": 29656, "mere": 29657, "ìĦ¸ë¸": 29658, "toby": 29659, "taf": 29660, "spoons": 29661, "meal": 29662, "fball": 29663, "fairfield": 29664, "janet": 29665, "silverstone": 29666, "dartmouth": 29667, "followme": 29668, "voyager": 29669, "kombat": 29670, "anniver": 29671, "enew": 29672, "magdal": 29673, "hove": 29674, "sath": 29675, "grizzly": 29676, "cardi": 29677, "gartner": 29678, "sandy": 29679, "kanye": 29680, "posture": 29681, "poign": 29682, "impulse": 29683, "radiology": 29684, "horizons": 29685, "siam": 29686, "aishwar": 29687, "==>": 29688, "noche": 29689, "tris": 29690, "elyn": 29691, "comme": 29692, "dui": 29693, "cec": 29694, "councillors": 29695, "cuddling": 29696, "creeping": 29697, "locke": 29698, "manages": 29699, "transferred": 29700, "necks": 29701, "dier": 29702, "dano": 29703, "vick": 29704, "lunches": 29705, "dhe": 29706, "ensures": 29707, "criss": 29708, "ulster": 29709, "bannon": 29710, "contenders": 29711, "spam": 29712, "sweetness": 29713, "medal": 29714, "honduras": 29715, "arctic": 29716, "ultrasound": 29717, "infr": 29718, "discovers": 29719, "eiffel": 29720, "casters": 29721, "ruben": 29722, "dust": 29723, "aweed": 29724, "atrium": 29725, "lestwe": 29726, "seared": 29727, "ðŁĵº:": 29728, "tyne": 29729, "exchanges": 29730, "littlemix": 29731, "lle": 29732, "astronauts": 29733, "hershey": 29734, "workday": 29735, "knob": 29736, "sov": 29737, "resigns": 29738, "todayshow": 29739, "derman": 29740, "anth": 29741, "afc": 29742, "taster": 29743, "swoo": 29744, "saeed": 29745, "pering": 29746, "narrowly": 29747, "rnli": 29748, "bestbuy": 29749, "panasonic": 29750, "obstacle": 29751, "farmers": 29752, "ðŁİĻ": 29753, "pawan": 29754, "kiest": 29755, "angers": 29756, "absurd": 29757, "ohmy": 29758, "sino": 29759, "pistachi": 29760, "spice": 29761, "giuli": 29762, "primetime": 29763, "kow": 29764, "kens": 29765, "exagger": 29766, "!?!": 29767, "uba": 29768, "middles": 29769, "judd": 29770, "ejec": 29771, "slammed": 29772, "pensions": 29773, "ofa": 29774, "recreate": 29775, "bhp": 29776, "xxl": 29777, "liverpool": 29778, "thresh": 29779, "purity": 29780, "nieu": 29781, "holics": 29782, "wrath": 29783, "rado": 29784, "glio": 29785, "amma": 29786, "dilemma": 29787, "cru": 29788, "letsgo": 29789, "....@": 29790, "âĿĵ": 29791, "suggesting": 29792, "trumps": 29793, "horus": 29794, "fv": 29795, "icom": 29796, "referring": 29797, "predictive": 29798, "tarts": 29799, "gette": 29800, "sock": 29801, "glossy": 29802, "pinky": 29803, "alec": 29804, "thyme": 29805, "oura": 29806, "theroad": 29807, "petr": 29808, "cram": 29809, "pfi": 29810, "dvn": 29811, "meier": 29812, "incentives": 29813, "tunnels": 29814, "mobil": 29815, "recap": 29816, "extras": 29817, "upright": 29818, "revamp": 29819, "perseverance": 29820, ",-": 29821, "otp": 29822, "mirror": 29823, "arwx": 29824, "gerry": 29825, "maher": 29826, "gor": 29827, "homepage": 29828, "amis": 29829, "agra": 29830, "madele": 29831, "bestfriend": 29832, "siriusxm": 29833, "bundles": 29834, "admiring": 29835, "tdsb": 29836, "ðŁįģ": 
29837, "chas": 29838, "slowing": 29839, "roh": 29840, "wallpapers": 29841, "âĢ¦/": 29842, "tekken": 29843, "gangs": 29844, "tala": 29845, "lindsay": 29846, "shoul": 29847, "linebacker": 29848, "toolkit": 29849, "uranium": 29850, "calyp": 29851, "abrams": 29852, "matthi": 29853, "ðŁı¿": 29854, "honourable": 29855, "dayo": 29856, "versail": 29857, "tank": 29858, "stc": 29859, "fritz": 29860, "splend": 29861, "patag": 29862, "annoyed": 29863, "onday": 29864, "devastated": 29865, "chattanooga": 29866, "nationalism": 29867, "massey": 29868, "jenn": 29869, "tailor": 29870, "devgn": 29871, "organs": 29872, "zucchini": 29873, "onfox": 29874, "satire": 29875, "wexford": 29876, "disgrace": 29877, "noto": 29878, "volta": 29879, "âĿ¤ï¸ıâĿ¤ï¸ıâĿ¤ï¸ıâĿ¤ï¸ı": 29880, "à¶": 29881, "homeowners": 29882, "pointer": 29883, "mcr": 29884, "austen": 29885, "daysto": 29886, "moons": 29887, "palma": 29888, "grazing": 29889, "eso": 29890, "influencers": 29891, "shahidkapoor": 29892, "compliant": 29893, "measurements": 29894, "develops": 29895, "yd": 29896, "parl": 29897, "pvt": 29898, "randolph": 29899, "tortured": 29900, "gerald": 29901, "elias": 29902, "deepikap": 29903, "warmup": 29904, "hickory": 29905, "gap": 29906, "coffin": 29907, "amour": 29908, "reneg": 29909, "mounting": 29910, "sevens": 29911, "igle": 29912, "hier": 29913, "decad": 29914, "tright": 29915, "escapes": 29916, "werner": 29917, "tfl": 29918, "fulfilled": 29919, "niger": 29920, "sourdough": 29921, "reaper": 29922, "chooses": 29923, "spinner": 29924, "weeknd": 29925, "filtered": 29926, "shuk": 29927, "kati": 29928, "oldham": 29929, "opensource": 29930, "khanna": 29931, "atelier": 29932, "connec": 29933, "ophobic": 29934, "glas": 29935, "complications": 29936, "arson": 29937, "councils": 29938, "smol": 29939, "assy": 29940, "lurking": 29941, "lingui": 29942, "hanks": 29943, "ein": 29944, "Ùħ": 29945, "rugs": 29946, "nguyen": 29947, "nouveau": 29948, "menace": 29949, "lev": 29950, "aladdin": 29951, "ruining": 29952, "roundabout": 29953, "km": 29954, "conor": 29955, "shoops": 29956, "mayday": 29957, "traumatic": 29958, "prabhas": 29959, "kaiser": 29960, "kita": 29961, "router": 29962, "pedro": 29963, "retar": 29964, "stunner": 29965, "spanish": 29966, "disturbed": 29967, "academy": 29968, "elearning": 29969, "witty": 29970, "seng": 29971, "feral": 29972, "avy": 29973, "stab": 29974, "keaton": 29975, "urdu": 29976, "koto": 29977, "hui": 29978, "cooke": 29979, "arian": 29980, "thepersonal": 29981, "uma": 29982, "seap": 29983, "asting": 29984, "rhetoric": 29985, "handwriting": 29986, "municipality": 29987, "consortium": 29988, "ðŁIJŁ": 29989, "glasgow": 29990, "raya": 29991, "eliza": 29992, "polymer": 29993, "broth": 29994, "practi": 29995, "correspondent": 29996, "addicts": 29997, "gayle": 29998, "ailing": 29999, "ofe": 30000, "pli": 30001, "heartw": 30002, "stitch": 30003, "sightings": 30004, "priests": 30005, "samo": 30006, "sloth": 30007, "goodwood": 30008, "rocco": 30009, "sabc": 30010, "summit": 30011, "lace": 30012, "presley": 30013, "itten": 30014, "cincy": 30015, "thepersonalnetwork": 30016, "sweek": 30017, "pegas": 30018, "afcon": 30019, "registry": 30020, "cim": 30021, "leth": 30022, "dicap": 30023, "candice": 30024, "fluent": 30025, "smack": 30026, "pedestri": 30027, "aloud": 30028, "carac": 30029, "priyankach": 30030, "pgh": 30031, "irons": 30032, "dolce": 30033, "latvia": 30034, "deceased": 30035, "therock": 30036, "clap": 30037, "cene": 30038, "foam": 30039, "morrissey": 30040, "gret": 30041, "essentially": 30042, "comcast": 30043, 
"beagle": 30044, "argues": 30045, "inged": 30046, "-âĢ¦": 30047, "sag": 30048, "hasan": 30049, "ðŁĻĨ": 30050, "ðŁį°": 30051, "nhra": 30052, "kannada": 30053, "indicators": 30054, "oner": 30055, "brixton": 30056, "atas": 30057, "screenplay": 30058, "sorority": 30059, "shaheed": 30060, "heem": 30061, "classmates": 30062, "tainment": 30063, "esi": 30064, "breastcancer": 30065, "zuckerberg": 30066, "auror": 30067, "encia": 30068, "refers": 30069, "kaeper": 30070, "vortex": 30071, "compart": 30072, "lymph": 30073, "photographing": 30074, "steff": 30075, "restling": 30076, "parsley": 30077, "momento": 30078, "thman": 30079, "lacking": 30080, "dutt": 30081, "oculus": 30082, "fino": 30083, "frenzy": 30084, "rasc": 30085, "dern": 30086, "dismissed": 30087, "nook": 30088, "metgala": 30089, "shill": 30090, "raphael": 30091, "mavericks": 30092, "exhibits": 30093, "eagerly": 30094, "cpa": 30095, "amenities": 30096, ".âłĢ": 30097, "exodus": 30098, "ernst": 30099, "lita": 30100, "dealt": 30101, "womensmarch": 30102, "iain": 30103, "scoreboard": 30104, "campeones": 30105, "cen": 30106, "tiki": 30107, "garrison": 30108, "fidelity": 30109, "brag": 30110, "roadmap": 30111, "psychop": 30112, "loe": 30113, "bleu": 30114, "ðŁijĬðŁı¼": 30115, "sauvi": 30116, "springer": 30117, "temptation": 30118, "rudolph": 30119, "acura": 30120, "wicz": 30121, "parachute": 30122, "strol": 30123, "lenny": 30124, "zik": 30125, "doms": 30126, "nbaf": 30127, "alpac": 30128, "vivian": 30129, "rove": 30130, "preet": 30131, "perpetu": 30132, "snake": 30133, "airsoft": 30134, "inflatable": 30135, "princes": 30136, "atie": 30137, "ffey": 30138, "patient": 30139, "mire": 30140, "chelle": 30141, "slack": 30142, "groovy": 30143, "#:": 30144, "uploading": 30145, "!!!!!!!!!!!!!!!!": 30146, "siemens": 30147, "provision": 30148, "vfx": 30149, "needy": 30150, "fats": 30151, "topoli": 30152, "bhutto": 30153, "sathletics": 30154, "alums": 30155, "twinning": 30156, "southwestern": 30157, "adopting": 30158, "lastnight": 30159, "manne": 30160, "laga": 30161, "twell": 30162, "acia": 30163, "----": 30164, "eyewear": 30165, "hurley": 30166, "flee": 30167, "sach": 30168, "pecker": 30169, "costly": 30170, "isk": 30171, "crates": 30172, "policy": 30173, "erosion": 30174, "ingo": 30175, "werk": 30176, "ðŁIJį": 30177, "tortoise": 30178, "therapies": 30179, "internet": 30180, "chihuahua": 30181, "rips": 30182, "frei": 30183, "edor": 30184, "taiji": 30185, "tfc": 30186, "dod": 30187, "dempsey": 30188, "christin": 30189, "cheng": 30190, "hips": 30191, "graeme": 30192, "compassionate": 30193, "cavaliers": 30194, "historic": 30195, "soulful": 30196, "criminal": 30197, "jac": 30198, "vinci": 30199, "expired": 30200, "surat": 30201, "turismo": 30202, "kona": 30203, "seaweed": 30204, "berts": 30205, "leica": 30206, "expressing": 30207, "aal": 30208, "wort": 30209, "breakfast": 30210, "herring": 30211, "amused": 30212, "rhubarb": 30213, "martian": 30214, "cosplayer": 30215, "yash": 30216, "strial": 30217, "raul": 30218, "referral": 30219, "dwts": 30220, "jw": 30221, "adler": 30222, "curtains": 30223, "gur": 30224, "valence": 30225, "tyrone": 30226, "swfc": 30227, "coached": 30228, "reborn": 30229, "diabetic": 30230, "choke": 30231, "norfolk": 30232, "investigative": 30233, "ðŁĴ¯ðŁĴ¯": 30234, "zid": 30235, "vmas": 30236, "phie": 30237, "objectives": 30238, "âľĭ": 30239, "overdue": 30240, "divers": 30241, "matsu": 30242, "ðŁİŁï¸ı": 30243, "casualties": 30244, "ว": 30245, "alk": 30246, "standardi": 30247, "realist": 30248, "artifacts": 30249, "pandor": 30250, "kex": 
30251, "invin": 30252, "(!)": 30253, "iney": 30254, "paraly": 30255, "mrt": 30256, "faye": 30257, "thevoice": 30258, "onga": 30259, "deed": 30260, "skinner": 30261, "azwx": 30262, "specimen": 30263, "priyankachopra": 30264, "nuevo": 30265, "barkley": 30266, "toulouse": 30267, "resumes": 30268, "footballers": 30269, "citi": 30270, "fetch": 30271, "ère": 30272, "lestweforget": 30273, "ðŁĻĭ": 30274, "chunk": 30275, "drifting": 30276, "manipulation": 30277, "equals": 30278, "putt": 30279, "kyungsoo": 30280, "âĿ¤ï¸ı#": 30281, "elastic": 30282, "parano": 30283, "foy": 30284, "doping": 30285, "cincy": 30286, "ssler": 30287, "interrupted": 30288, "alay": 30289, "adores": 30290, "amethy": 30291, "convoy": 30292, "ãĢı": 30293, "Ĭãģ": 30294, "blacklist": 30295, "generals": 30296, "sachin": 30297, "brushed": 30298, "ounces": 30299, "nonstop": 30300, "illiams": 30301, "btsarmy": 30302, "uav": 30303, "ruff": 30304, "burma": 30305, "bik": 30306, "defence": 30307, "schultz": 30308, "boasts": 30309, "loneliness": 30310, "gore": 30311, "transforms": 30312, "alumna": 30313, "@@": 30314, "rappers": 30315, "nehru": 30316, "caro": 30317, "himalayan": 30318, "wearables": 30319, "geh": 30320, "peppermint": 30321, "redevelopment": 30322, "flamingo": 30323, "cosby": 30324, "bigbaldhead": 30325, "agri": 30326, "barefoot": 30327, "scopes": 30328, "regram": 30329, "ghana": 30330, "ðŁİ«": 30331, "iheart": 30332, "sadie": 30333, "carrie": 30334, "microbial": 30335, "kuala": 30336, "skater": 30337, "querque": 30338, "âĻ©": 30339, "genres": 30340, "reasoning": 30341, "chased": 30342, "aso": 30343, "slipped": 30344, "encan": 30345, "vamos": 30346, "kers": 30347, "adverse": 30348, "moil": 30349, "commodities": 30350, "withyou": 30351, "silent": 30352, "hype": 30353, "ande": 30354, "amination": 30355, "whispe": 30356, "litz": 30357, "âļ½ï¸ıâļ½ï¸ı": 30358, "riff": 30359, "ppy": 30360, "lambs": 30361, "ganesh": 30362, "absent": 30363, "regulator": 30364, "marseille": 30365, "enroll": 30366, "parcel": 30367, "wap": 30368, "byrd": 30369, "ðŁĩŃ": 30370, "tuber": 30371, "countrymusic": 30372, "parl": 30373, "controllers": 30374, "responsibilities": 30375, "wey": 30376, "chate": 30377, "montenegro": 30378, "chico": 30379, "milan": 30380, "lms": 30381, "trainees": 30382, "appropriately": 30383, "uncertain": 30384, "poppies": 30385, "edsheeran": 30386, "nutritious": 30387, "garo": 30388, "deutsch": 30389, "awesome": 30390, "ãĥ¼": 30391, "comfortably": 30392, "landmarks": 30393, "eti": 30394, "reusable": 30395, "danielle": 30396, "rosal": 30397, "coles": 30398, "justic": 30399, "ccs": 30400, "fanny": 30401, "nim": 30402, "mcu": 30403, "clinch": 30404, "atene": 30405, "merge": 30406, "imdb": 30407, "anglo": 30408, "uccino": 30409, "panini": 30410, "annot": 30411, "burberry": 30412, "feature": 30413, "predicting": 30414, "fashionista": 30415, "sask": 30416, "imaginary": 30417, "mmo": 30418, "southsudan": 30419, "spear": 30420, "hubble": 30421, "jointhe": 30422, "coyotes": 30423, "sligo": 30424, "kodak": 30425, "sitcom": 30426, "polaroid": 30427, "rooted": 30428, "corrup": 30429, "ðŁĻĮðŁĻĮ": 30430, "brisban": 30431, "atz": 30432, "ahl": 30433, "remy": 30434, "talent": 30435, "avalon": 30436, "rada": 30437, "pauline": 30438, "locomotive": 30439, "goons": 30440, "nemo": 30441, "maserati": 30442, "icu": 30443, "stutt": 30444, "historically": 30445, "smb": 30446, "presby": 30447, "avoid": 30448, "sooners": 30449, "rhinestone": 30450, "wad": 30451, "rising": 30452, "trot": 30453, "modes": 30454, "regent": 30455, "optimize": 30456, "reece": 
30457, "smu": 30458, "verti": 30459, "newyorkcity": 30460, "cortez": 30461, "rac": 30462, "incase": 30463, "sinc": 30464, "fielding": 30465, "etta": 30466, "tiffany": 30467, "almonds": 30468, "saddle": 30469, "krat": 30470, "matter": 30471, "glow": 30472, "starving": 30473, "glo": 30474, "crappy": 30475, "slur": 30476, "std": 30477, "monitors": 30478, "receipt": 30479, "maymayentrata": 30480, "mcil": 30481, "unis": 30482, "rainbows": 30483, "caldwell": 30484, "pacquiao": 30485, "jop": 30486, "afe": 30487, "hook": 30488, "essen": 30489, "wizard": 30490, "median": 30491, "flaws": 30492, "coms": 30493, "âĿĦ": 30494, "ingh": 30495, "haynes": 30496, "antonio": 30497, "templates": 30498, "outer": 30499, "naw": 30500, "cardigan": 30501, "belgrade": 30502, "ðŁĴī": 30503, "homo": 30504, "aise": 30505, "ropes": 30506, "nove": 30507, "whatyou": 30508, "trigge": 30509, "conception": 30510, "adukone": 30511, "nadi": 30512, "friars": 30513, "swer": 30514, "adjusted": 30515, "hotline": 30516, "sanity": 30517, "kaur": 30518, "downloading": 30519, "cgi": 30520, "tenor": 30521, "ethnic": 30522, "appalach": 30523, "ุ": 30524, "pag": 30525, "golds": 30526, "onset": 30527, "investigator": 30528, "cartel": 30529, "peacefully": 30530, "jarrett": 30531, "catalan": 30532, "polio": 30533, "num": 30534, "frustration": 30535, "dharma": 30536, "mylife": 30537, "âľĮðŁı»": 30538, "aberdeen": 30539, "musa": 30540, "binder": 30541, "sparkly": 30542, "fleeing": 30543, "instinct": 30544, "coping": 30545, "dominance": 30546, "illers": 30547, "era": 30548, "uconn": 30549, "looms": 30550, "livingston": 30551, "gali": 30552, "hes": 30553, "cma": 30554, "bela": 30555, "seley": 30556, "monk": 30557, "lach": 30558, "marx": 30559, "´": 30560, "merica": 30561, "womanin": 30562, "essex": 30563, "raina": 30564, "jimi": 30565, "neptune": 30566, "zack": 30567, "chinese": 30568, "martins": 30569, "chandelier": 30570, "hern": 30571, "withus": 30572, "earl": 30573, "asphalt": 30574, "modules": 30575, "stp": 30576, "ulla": 30577, "psychiatric": 30578, "mileage": 30579, "captivating": 30580, "sider": 30581, "mento": 30582, "mort": 30583, "trance": 30584, "talbot": 30585, "abby": 30586, "ìĥ": 30587, "âľĮðŁı¼": 30588, "jak": 30589, "dawn": 30590, "turnup": 30591, "screwed": 30592, "feds": 30593, "blueprint": 30594, "ðŁĴĸðŁĴĸ": 30595, "harsh": 30596, "eros": 30597, "insomnia": 30598, "bankers": 30599, "taemin": 30600, "misconduct": 30601, "humber": 30602, "gidi": 30603, "eduardo": 30604, "cona": 30605, "muscular": 30606, "consuming": 30607, "rash": 30608, "donnie": 30609, "dipped": 30610, "collie": 30611, "samuel": 30612, "meltdown": 30613, "ðŁĺįðŁĺįðŁĺį": 30614, "mez": 30615, "examining": 30616, "schwartz": 30617, "pristine": 30618, "ðŁIJĿ": 30619, "veit": 30620, "fulfilling": 30621, "anesthe": 30622, "guesses": 30623, "draft": 30624, "somme": 30625, "solid": 30626, "pational": 30627, "hoped": 30628, "evolutionary": 30629, "aller": 30630, "entertained": 30631, "slips": 30632, "ludwig": 30633, "concludes": 30634, "sensible": 30635, "bonnet": 30636, "craze": 30637, "tras": 30638, "hazards": 30639, "constantine": 30640, "edics": 30641, "startrek": 30642, "toc": 30643, "occupational": 30644, "incheon": 30645, "deepikapadukone": 30646, "pizzas": 30647, "newcomer": 30648, "depart": 30649, "oppression": 30650, "ebony": 30651, "fossils": 30652, "trojan": 30653, "elen": 30654, "steaks": 30655, "khou": 30656, "positioning": 30657, "ugby": 30658, "redcross": 30659, "akh": 30660, "dolce": 30661, "usmnt": 30662, "ppen": 30663, "dilig": 30664, "mavs": 
30665, "caller": 30666, "costello": 30667, "âĽĦ": 30668, "dyn": 30669, "things": 30670, "rhinos": 30671, "axi": 30672, "sarkar": 30673, "convocation": 30674, "atters": 30675, "ssss": 30676, "fungus": 30677, "eugen": 30678, "russo": 30679, "squat": 30680, "wsb": 30681, "elion": 30682, "williamsburg": 30683, "soff": 30684, "deficiency": 30685, "bearer": 30686, "okin": 30687, "keystone": 30688, "twain": 30689, "calming": 30690, "breakable": 30691, "wares": 30692, "horseracing": 30693, "combs": 30694, "bunting": 30695, "uit": 30696, "tland": 30697, "ðŁĴĻðŁĴĻðŁĴĻ": 30698, "gastron": 30699, "sabot": 30700, "ickers": 30701, "commissioners": 30702, "senate": 30703, "iiot": 30704, "athena": 30705, "nitrogen": 30706, "antony": 30707, "erotic": 30708, "dialo": 30709, "missou": 30710, "hypocr": 30711, "âľĪ": 30712, "kaepernick": 30713, "canv": 30714, "droo": 30715, "cleveland": 30716, "osh": 30717, "monsta": 30718, "stefano": 30719, "^)": 30720, "shul": 30721, "poison": 30722, "hae": 30723, "commercials": 30724, "maul": 30725, "nitro": 30726, "coworker": 30727, "aloe": 30728, "vapor": 30729, "tents": 30730, "russian": 30731, "quid": 30732, "questionable": 30733, "midget": 30734, "poker": 30735, "girlfriends": 30736, "sinthe": 30737, "eritrea": 30738, "tenure": 30739, "deposits": 30740, "buckeyes": 30741, "spotter": 30742, "theodore": 30743, "trinity": 30744, "joaquin": 30745, "ucci": 30746, "followthe": 30747, "cafc": 30748, "mpa": 30749, "ðŁIJ»": 30750, "plotting": 30751, "domino": 30752, "taek": 30753, "sionally": 30754, "dicaprio": 30755, "pap": 30756, "carmel": 30757, "iger": 30758, "btcc": 30759, "bethle": 30760, "wwwbigbaldhead": 30761, "foodie": 30762, "baghdad": 30763, "masonry": 30764, "offended": 30765, "à·": 30766, "à¸ģ": 30767, "scro": 30768, "verses": 30769, "orient": 30770, "arches": 30771, "piyu": 30772, "knowyour": 30773, "gree": 30774, "takers": 30775, "guard": 30776, "dishon": 30777, "bucketlist": 30778, "bhafc": 30779, "wardly": 30780, "ðŁİīðŁİĬ": 30781, "leighton": 30782, "pew": 30783, "stray": 30784, "assaulted": 30785, "inhal": 30786, "lyfe": 30787, "amarketing": 30788, "lx": 30789, "katz": 30790, "ubuntu": 30791, "meo": 30792, "cartoonist": 30793, "turnover": 30794, "miz": 30795, "dislike": 30796, "mullen": 30797, "mof": 30798, "bland": 30799, "hides": 30800, "emerges": 30801, "chorizo": 30802, "trustee": 30803, "mahog": 30804, "lansing": 30805, "paralympic": 30806, "faint": 30807, "fauna": 30808, "chal": 30809, "snar": 30810, "cath": 30811, "benton": 30812, "castillo": 30813, "slippery": 30814, "apricot": 30815, "oecd": 30816, "baro": 30817, "lz": 30818, "heming": 30819, "clowns": 30820, "coworkers": 30821, "peruvian": 30822, "commuters": 30823, "yell": 30824, "ðŁļ´": 30825, "undering": 30826, "vj": 30827, "ttp": 30828, "flipk": 30829, "wana": 30830, "socent": 30831, "ĤâĸĤâĸ": 30832, "à¤Ĥ": 30833, "oosa": 30834, "jagger": 30835, "dism": 30836, "eless": 30837, "dham": 30838, "calif": 30839, "aofficial": 30840, "eclip": 30841, "harrogate": 30842, "grapp": 30843, "comrade": 30844, "ntr": 30845, "concentrate": 30846, "thighs": 30847, "bitcoin": 30848, "belarus": 30849, "ëĵ": 30850, "enduring": 30851, "nowwatching": 30852, "industrial": 30853, "pip": 30854, "aron": 30855, "arat": 30856, "®": 30857, "whitby": 30858, "ooooooo": 30859, "saree": 30860, "ticals": 30861, "misleading": 30862, "yoon": 30863, "years": 30864, "sleigh": 30865, "romanian": 30866, "scissors": 30867, "vampires": 30868, "acup": 30869, "abba": 30870, "thweeksary": 30871, "centri": 30872, "flye": 30873, "uo": 30874, 
"cbi": 30875, "buena": 30876, "sind": 30877, "marino": 30878, "burr": 30879, "rebuilding": 30880, "ल": 30881, "anniversaire": 30882, "acca": 30883, "ðŁĴĢðŁĴĢ": 30884, "getting": 30885, "tulips": 30886, "wolfpack": 30887, "âľįï¸ı": 30888, "morethan": 30889, "takin": 30890, "ðŁ¤ĺðŁı»": 30891, "ube": 30892, "monic": 30893, "doubts": 30894, "mower": 30895, "cobalt": 30896, "donne": 30897, "speculation": 30898, "arguably": 30899, "kaku": 30900, "https": 30901, "prosecution": 30902, "dinah": 30903, "stamatic": 30904, "disclosed": 30905, "beverly": 30906, "flwx": 30907, "crabs": 30908, "extraordinaire": 30909, "warmest": 30910, "imperi": 30911, "ologists": 30912, "traces": 30913, "parc": 30914, "lakeside": 30915, "amr": 30916, "teri": 30917, "hourly": 30918, "domination": 30919, "arrow": 30920, "shrewsbury": 30921, "ancestry": 30922, "wrangler": 30923, "triggered": 30924, "pensac": 30925, "rooster": 30926, "survives": 30927, "aon": 30928, "boko": 30929, "valor": 30930, "loveis": 30931, "lag": 30932, "pey": 30933, "focal": 30934, "outlaws": 30935, "blanc": 30936, "articho": 30937, "wits": 30938, "marshall": 30939, "diego": 30940, "supportsmall": 30941, "uca": 30942, "sah": 30943, "jeet": 30944, "synago": 30945, "governing": 30946, "ðŁĴ¬": 30947, "salads": 30948, "create": 30949, "miriam": 30950, "censored": 30951, "amide": 30952, "nou": 30953, "zeta": 30954, "allegiance": 30955, "*)": 30956, "blm": 30957, "rican": 30958, "pastors": 30959, "olympus": 30960, "bloc": 30961, "whirl": 30962, "starry": 30963, "prone": 30964, "yk": 30965, "pne": 30966, "congratulating": 30967, "bev": 30968, "sober": 30969, "loveisland": 30970, "sair": 30971, "aning": 30972, "tutorials": 30973, "qe": 30974, "lund": 30975, "inist": 30976, "clever": 30977, "taxpayer": 30978, "aliz": 30979, "wrench": 30980, "ddling": 30981, "capri": 30982, "hpa": 30983, "ðŁı»âĢįâĻĤï¸ı": 30984, "naj": 30985, "oj": 30986, "futuristic": 30987, "jellyfish": 30988, "ðŁĶ¥ðŁĶ¥ðŁĶ¥ðŁĶ¥": 30989, "celery": 30990, "plank": 30991, "fila": 30992, "neme": 30993, "unhealthy": 30994, "lections": 30995, "ðŁ§¡": 30996, "ritchie": 30997, "nws": 30998, "mikha": 30999, "wonderwoman": 31000, "âĢİ": 31001, "hipstamatic": 31002, "kag": 31003, "ðŁĴľðŁĴľðŁĴľ": 31004, "poultry": 31005, "mow": 31006, "words": 31007, "loff": 31008, "ðŁ¤£ðŁ¤£": 31009, "relatable": 31010, "remixes": 31011, "kenyatta": 31012, "kem": 31013, "resigned": 31014, "fod": 31015, "straigh": 31016, "jlo": 31017, "hutch": 31018, "boxers": 31019, "colleen": 31020, "mags": 31021, "instructional": 31022, "kol": 31023, "attracts": 31024, "prag": 31025, "accountant": 31026, "goggles": 31027, "bru": 31028, "thole": 31029, "marrow": 31030, "leuke": 31031, "octo": 31032, "ponds": 31033, "bubbly": 31034, "heist": 31035, "ìĹij": 31036, "imp": 31037, "ahar": 31038, "haunt": 31039, "hallmark": 31040, "psych": 31041, "kkkkkkkk": 31042, "columb": 31043, "jumpsuit": 31044, "costco": 31045, "sidelines": 31046, "aggies": 31047, "overturned": 31048, "nib": 31049, "keychain": 31050, "fuk": 31051, "faf": 31052, "miam": 31053, "assistants": 31054, "cycled": 31055, "rider": 31056, "dammit": 31057, "redwings": 31058, "mages": 31059, "kins": 31060, "ìĤ": 31061, "hod": 31062, "sont": 31063, "caroline": 31064, "\"'": 31065, "cule": 31066, "braid": 31067, "felony": 31068, "arities": 31069, "rutherford": 31070, "depiction": 31071, "isabelle": 31072, "roach": 31073, "kday": 31074, "fifthharmony": 31075, "emy": 31076, "ligam": 31077, "barista": 31078, "albuquerque": 31079, "gross": 31080, "ðŁįº": 31081, "ooks": 31082, "ðŁij¼": 
31083, "duncan": 31084, "tryin": 31085, "jags": 31086, "gould": 31087, "litho": 31088, "âģ£": 31089, "аÐ": 31090, "sammy": 31091, "tung": 31092, "casser": 31093, "apolo": 31094, "aaaaa": 31095, "mang": 31096, "asics": 31097, "shen": 31098, "pye": 31099, "turbul": 31100, "ssp": 31101, "saintsfc": 31102, "onlin": 31103, "nanny": 31104, "hester": 31105, "doz": 31106, "à¸Ķ": 31107, "thread": 31108, "rents": 31109, "khand": 31110, "ðŁĴªðŁı½": 31111, "unconditional": 31112, "robson": 31113, "carre": 31114, "phon": 31115, "sacrificed": 31116, "£": 31117, "autos": 31118, "parker": 31119, "oca": 31120, "login": 31121, "keegan": 31122, "hardcover": 31123, "doughnuts": 31124, "ðŁĮİ": 31125, "spitfire": 31126, "refreshments": 31127, "saskatoon": 31128, "commodore": 31129, "jf": 31130, "rubber": 31131, "halamadrid": 31132, "childcare": 31133, "strada": 31134, "iom": 31135, "rik": 31136, "dakar": 31137, "thermom": 31138, "cropped": 31139, "garu": 31140, "alik": 31141, "veni": 31142, "ift": 31143, "sika": 31144, "rituals": 31145, "zul": 31146, "ech": 31147, "©": 31148, "sudan": 31149, "lland": 31150, "ime": 31151, "docker": 31152, "ì¤": 31153, "feared": 31154, "fao": 31155, "walter": 31156, "nog": 31157, "mutuals": 31158, "lh": 31159, "align": 31160, "monia": 31161, "conceptart": 31162, "ðŁĻıðŁı¼": 31163, "scoe": 31164, "competence": 31165, "swine": 31166, "lyme": 31167, "launch": 31168, "greener": 31169, "abstractart": 31170, "inquis": 31171, "granada": 31172, "gaelic": 31173, "fluff": 31174, "dbacks": 31175, "graveyard": 31176, "babe": 31177, "academic": 31178, "adventurous": 31179, "johann": 31180, "~!": 31181, "bibi": 31182, "|#": 31183, "plings": 31184, "getty": 31185, "asb": 31186, "âĿ¤ï¸ı@": 31187, "staff": 31188, "religions": 31189, "bangor": 31190, "worldbookday": 31191, "megh": 31192, "devin": 31193, "ashore": 31194, "meridian": 31195, "github": 31196, "quiz": 31197, "allstars": 31198, "bestest": 31199, "irresi": 31200, "acker": 31201, "dote": 31202, "warrington": 31203, "polly": 31204, "neworleans": 31205, "crou": 31206, "wigs": 31207, "chey": 31208, "smithsonian": 31209, "lasag": 31210, "detour": 31211, "boris": 31212, "straps": 31213, "mariah": 31214, "intentionally": 31215, "koh": 31216, "ðŁį¸": 31217, "ssian": 31218, "marissa": 31219, "coral": 31220, "episcopal": 31221, "casualty": 31222, "tomo": 31223, "supplychain": 31224, "samp": 31225, "ongo": 31226, "roo": 31227, "caviar": 31228, "pfw": 31229, "claudio": 31230, "buffalo": 31231, "sations": 31232, "matty": 31233, "snapback": 31234, "lds": 31235, "alarms": 31236, "matte": 31237, "âĺĶï¸ı": 31238, "conditioner": 31239, "dors": 31240, "hex": 31241, "fizz": 31242, "astri": 31243, "sussex": 31244, "security": 31245, "qaeda": 31246, "allstar": 31247, "cocacola": 31248, "asone": 31249, "clicks": 31250, "scans": 31251, "mute": 31252, "heavier": 31253, "ðŁİ§": 31254, "âĺŀ": 31255, "lvl": 31256, "bookboost": 31257, "youtube": 31258, "flashes": 31259, "fjor": 31260, "csu": 31261, "explode": 31262, "dodge": 31263, "cairn": 31264, "gonzales": 31265, "thill": 31266, "pelle": 31267, "hartley": 31268, "renewable": 31269, "retin": 31270, "estre": 31271, "costarica": 31272, "shipyard": 31273, "ncfc": 31274, "priya": 31275, "aghan": 31276, "anath": 31277, "plugin": 31278, "corey": 31279, "rebound": 31280, "oru": 31281, "katrin": 31282, "hormone": 31283, "gim": 31284, "mahindra": 31285, "ssus": 31286, "parkland": 31287, "harper": 31288, "fantastic": 31289, "inferno": 31290, "epilo": 31291, "wrestling": 31292, "fect": 31293, "cit": 31294, "acoun": 31295, 
"tossed": 31296, "monumental": 31297, "chartered": 31298, "bust": 31299, "petra": 31300, "âĮļ": 31301, "wildflowerhour": 31302, "sweaters": 31303, "*.": 31304, "bler": 31305, "atech": 31306, "gowan": 31307, "demographic": 31308, "bral": 31309, "suicide": 31310, "renovations": 31311, "vuel": 31312, "sinister": 31313, "armani": 31314, "misogy": 31315, "pharrell": 31316, "naps": 31317, "uniting": 31318, "crusaders": 31319, "corgi": 31320, "insured": 31321, "thani": 31322, "noor": 31323, "gq": 31324, "dada": 31325, "bicycles": 31326, "snuggle": 31327, "schan": 31328, "tenberg": 31329, "ssal": 31330, "femme": 31331, "boil": 31332, "½ï¸ı": 31333, "reap": 31334, "occurring": 31335, "hussein": 31336, "divid": 31337, "stoke": 31338, "shalom": 31339, "naia": 31340, "olic": 31341, "frustrating": 31342, "Ùĩ": 31343, "igs": 31344, "grover": 31345, "scenarios": 31346, "nds": 31347, "brutality": 31348, "medalli": 31349, "buon": 31350, "sass": 31351, "skateboarding": 31352, "onyx": 31353, "lorry": 31354, "nyu": 31355, "gautam": 31356, "mmings": 31357, "gug": 31358, "endi": 31359, "lothian": 31360, "commando": 31361, "chalk": 31362, "phora": 31363, "assessing": 31364, "tigh": 31365, "crunchy": 31366, "aday": 31367, "isl": 31368, "ciara": 31369, "pilgrims": 31370, "kamal": 31371, "pto": 31372, "britanni": 31373, "tani": 31374, "smc": 31375, "lure": 31376, "appstore": 31377, "aby": 31378, "golfing": 31379, "clc": 31380, "fau": 31381, "anas": 31382, "shutting": 31383, "regulated": 31384, "carnage": 31385, "scowboys": 31386, "allenge": 31387, "cma": 31388, "humboldt": 31389, "relle": 31390, "kumb": 31391, "heri": 31392, "refinery": 31393, "soundcheck": 31394, "dwayne": 31395, "bosnia": 31396, "isp": 31397, "thealth": 31398, "anniv": 31399, "relevance": 31400, "mya": 31401, "baggage": 31402, "dread": 31403, "sbc": 31404, "thed": 31405, "buh": 31406, "hijab": 31407, "loid": 31408, "kew": 31409, "cte": 31410, "respect": 31411, "lovelies": 31412, "cubes": 31413, "celebrate": 31414, "dirt": 31415, "savers": 31416, "_,": 31417, "garment": 31418, "pulitzer": 31419, "masjid": 31420, "beatport": 31421, "alarts": 31422, "encryption": 31423, "sner": 31424, "pleads": 31425, "foundry": 31426, "symmetry": 31427, "rumi": 31428, "birthplace": 31429, "scallops": 31430, "supple": 31431, "pivotal": 31432, "tati": 31433, "node": 31434, "sod": 31435, "proxim": 31436, "trics": 31437, "coldest": 31438, "brent": 31439, "mandu": 31440, "clair": 31441, "each": 31442, "andalu": 31443, "hiddleston": 31444, "ðŁIJº": 31445, "melts": 31446, "vance": 31447, "pinn": 31448, "sements": 31449, "screened": 31450, "sachs": 31451, "obl": 31452, "icha": 31453, "âĺĺï¸ı": 31454, "schoolers": 31455, "healed": 31456, "logged": 31457, "ðŁ¤ĺðŁı¼": 31458, "icus": 31459, "boredom": 31460, "bish": 31461, "bffs": 31462, "talking": 31463, "suresh": 31464, "hookem": 31465, "deon": 31466, "defl": 31467, "eileen": 31468, "ðŁįķ": 31469, "womenintech": 31470, "risotto": 31471, "ranger": 31472, "advertise": 31473, "à¸ģà¸": 31474, "telly": 31475, "lago": 31476, "dartmoor": 31477, "dong": 31478, "skates": 31479, "logo": 31480, "unner": 31481, "mailbox": 31482, "masala": 31483, "looooo": 31484, "amethyst": 31485, "chewing": 31486, "cbb": 31487, "australians": 31488, "rcmp": 31489, "gameart": 31490, "#...": 31491, "korn": 31492, "extremism": 31493, "fruitful": 31494, "ancient": 31495, "pubg": 31496, "polite": 31497, "whit": 31498, "murals": 31499, "mgr": 31500, "lineman": 31501, "davao": 31502, "stems": 31503, "tennis": 31504, "avage": 31505, "tupac": 31506, "gigantic": 
31507, "hsbc": 31508, "autobiography": 31509, "upthe": 31510, "ีà¹Ī": 31511, "regal": 31512, "figuring": 31513, "kul": 31514, "missy": 31515, "hoop": 31516, "gras": 31517, "forums": 31518, "backlash": 31519, "abducted": 31520, "pnw": 31521, "minic": 31522, "butt": 31523, "bottoms": 31524, "aton": 31525, "veng": 31526, "ðŁĮı": 31527, "delaney": 31528, "prabhu": 31529, "fanclub": 31530, "overhaul": 31531, "healthye": 31532, "syno": 31533, "aaf": 31534, "renamed": 31535, "kimi": 31536, "uncle": 31537, "mancity": 31538, "seu": 31539, "quanti": 31540, "esteem": 31541, "umin": 31542, "enzo": 31543, "melvin": 31544, "undergo": 31545, "jhar": 31546, "farah": 31547, "coasters": 31548, "humphrey": 31549, "mhz": 31550, "childrens": 31551, "^.": 31552, "dhi": 31553, "disruptive": 31554, "integrating": 31555, "rnb": 31556, "oversized": 31557, "aide": 31558, "neau": 31559, "documentation": 31560, "ðŁijĢðŁijĢ": 31561, "palo": 31562, "hearth": 31563, "riyad": 31564, "punctu": 31565, "abcnews": 31566, "secures": 31567, "boyband": 31568, "birch": 31569, "juco": 31570, "traff": 31571, "legislators": 31572, "baya": 31573, "ãĤ¯": 31574, "noises": 31575, "collects": 31576, "swarm": 31577, "kner": 31578, "bishops": 31579, "sturgeon": 31580, "snapping": 31581, "mol": 31582, "freaky": 31583, "chairperson": 31584, "trop": 31585, "lynch": 31586, "carcin": 31587, "artsy": 31588, "esto": 31589, "chai": 31590, "flur": 31591, "invali": 31592, "sausages": 31593, "imel": 31594, "jor": 31595, "funfact": 31596, "witter": 31597, "punished": 31598, "acons": 31599, "hya": 31600, "reversi": 31601, "emc": 31602, "diffu": 31603, "zx": 31604, "spaw": 31605, "clad": 31606, "dmit": 31607, "holland": 31608, "fresco": 31609, "payroll": 31610, "abundant": 31611, "stuffing": 31612, "moro": 31613, "cny": 31614, "boycott": 31615, "wendy": 31616, "eleven": 31617, "provoc": 31618, "pilot": 31619, "trx": 31620, "bead": 31621, "climateaction": 31622, "rion": 31623, "assie": 31624, "ìĸ": 31625, "osm": 31626, "islamic": 31627, "hoar": 31628, "goodreads": 31629, "alici": 31630, "afternoons": 31631, "spokesman": 31632, "jolie": 31633, "itas": 31634, "mascara": 31635, "âĻ©âĻ«": 31636, "prevail": 31637, "beetroot": 31638, "lujah": 31639, "kli": 31640, "dodger": 31641, "»": 31642, "rule": 31643, "ln": 31644, "scream": 31645, "hobart": 31646, "colbert": 31647, "rtc": 31648, "erm": 31649, "patro": 31650, "quoting": 31651, "slive": 31652, "quest": 31653, "nonfiction": 31654, "seminary": 31655, "prosecutors": 31656, "vest": 31657, "expressway": 31658, "gge": 31659, "nautical": 31660, "etf": 31661, "ðŁİīðŁİĬ": 31662, "duration": 31663, "chaired": 31664, "thefilm": 31665, "fabio": 31666, "sheh": 31667, "cano": 31668, "ðŁĴªðŁı»": 31669, "withdraw": 31670, "!:)": 31671, "corpus": 31672, "phenom": 31673, "yelp": 31674, "lawn": 31675, "entom": 31676, "snapper": 31677, "butte": 31678, "pinball": 31679, "proxy": 31680, "libre": 31681, "allevi": 31682, "nada": 31683, "gabriel": 31684, "fowl": 31685, "eureka": 31686, "daphne": 31687, "tunes": 31688, "punched": 31689, "whore": 31690, "jog": 31691, "rential": 31692, "manners": 31693, "ope": 31694, "whufc": 31695, "guth": 31696, "revolt": 31697, "sneaker": 31698, "philharmonic": 31699, "hoste": 31700, "sovereignty": 31701, "ðŁĻıðŁĻıðŁĻı": 31702, "fishing": 31703, "sciart": 31704, "feta": 31705, "ipp": 31706, "dumping": 31707, "kelown": 31708, "giri": 31709, "digits": 31710, "salu": 31711, "sanjay": 31712, "tweeters": 31713, "spas": 31714, "colchester": 31715, "scab": 31716, "madd": 31717, "à¹Ħà¸": 31718, "Äĩ": 
31719, "geddon": 31720, "marchfor": 31721, "dop": 31722, "maureen": 31723, "unplugged": 31724, "dido": 31725, "fashionblogger": 31726, "upa": 31727, "mexic": 31728, "tary": 31729, "polye": 31730, "jameson": 31731, "vt": 31732, "grinder": 31733, "maddy": 31734, "consultancy": 31735, "¬ë": 31736, "leagueoflegends": 31737, "accents": 31738, "umni": 31739, "janeiro": 31740, "tuss": 31741, "hens": 31742, "amplifier": 31743, "toshi": 31744, "prettier": 31745, "prevents": 31746, "newtown": 31747, "redwood": 31748, "vantage": 31749, "ballard": 31750, "artof": 31751, "ashe": 31752, "asion": 31753, "lacey": 31754, "apat": 31755, "grove": 31756, "à¸Ħ": 31757, "rwand": 31758, "realtors": 31759, "traitor": 31760, "bedding": 31761, "ör": 31762, "zion": 31763, "flashing": 31764, "campan": 31765, "boomer": 31766, "secretariat": 31767, "abol": 31768, "litigation": 31769, "contamination": 31770, "sedly": 31771, "shredded": 31772, "infor": 31773, "doherty": 31774, "benchmark": 31775, "roche": 31776, "skateboard": 31777, "shovel": 31778, "izz": 31779, "topper": 31780, "oster": 31781, "labyrin": 31782, "autum": 31783, "kong": 31784, "hummus": 31785, "viz": 31786, "technews": 31787, "klaus": 31788, "amusing": 31789, "socialmediamarketing": 31790, "ides": 31791, "castell": 31792, "stee": 31793, "underestimate": 31794, "calab": 31795, "paign": 31796, "billing": 31797, "unanimously": 31798, "gmb": 31799, "flyfishing": 31800, "hathaway": 31801, "commercial": 31802, "colouring": 31803, "skulls": 31804, "pivot": 31805, "tep": 31806, "tbc": 31807, "motorway": 31808, "xpress": 31809, "constructive": 31810, "puk": 31811, "underlying": 31812, "kirsten": 31813, "maniac": 31814, "chao": 31815, "sema": 31816, "chiffon": 31817, "ðŁijĮðŁı»": 31818, "verona": 31819, "komo": 31820, "standoff": 31821, "wiped": 31822, "cated": 31823, "blair": 31824, "workin": 31825, "msc": 31826, "bethlehem": 31827, "swipe": 31828, "unexpec": 31829, "pees": 31830, "petri": 31831, "origami": 31832, "ðŁijħ": 31833, "mexico": 31834, "flavor": 31835, "rudd": 31836, "cannabis": 31837, "maru": 31838, "riddle": 31839, "worshi": 31840, "silon": 31841, "schat": 31842, "apse": 31843, "tanger": 31844, "bious": 31845, "eer": 31846, "questioned": 31847, "ozar": 31848, "dank": 31849, "anglesey": 31850, "charan": 31851, "baku": 31852, "competen": 31853, "repri": 31854, "batter": 31855, "saxon": 31856, "calves": 31857, "lengths": 31858, "$$$": 31859, "âŀ¡ï¸ı": 31860, "immersion": 31861, "gaunt": 31862, "carry": 31863, "cyto": 31864, "banda": 31865, "shutt": 31866, "experience": 31867, "elgin": 31868, "mousse": 31869, "taz": 31870, "êµ": 31871, "incorrect": 31872, "enz": 31873, "bham": 31874, "moron": 31875, "sover": 31876, "arun": 31877, "tipped": 31878, "lable": 31879, "dearly": 31880, "bautista": 31881, "íĻ": 31882, "mortal": 31883, "woop": 31884, "dtla": 31885, "shocks": 31886, "davos": 31887, "ðŁĵĿ": 31888, "swimwear": 31889, "herman": 31890, "ðŁijĩðŁijĩ": 31891, "zir": 31892, "neglected": 31893, "graced": 31894, "campuses": 31895, "avs": 31896, "arora": 31897, "swachhb": 31898, "livepd": 31899, "accra": 31900, "enquiries": 31901, "shooters": 31902, "kurt": 31903, "vancouver": 31904, "bradley": 31905, "garda": 31906, "gü": 31907, "olla": 31908, "attracting": 31909, "upton": 31910, "newin": 31911, "lumia": 31912, "furnace": 31913, "evers": 31914, "eon": 31915, "swa": 31916, "rookies": 31917, "aoc": 31918, "vss": 31919, "brisket": 31920, "torch": 31921, "yoda": 31922, "heartland": 31923, "taco": 31924, "phony": 31925, "foodbank": 31926, "abbey": 31927, 
"babylon": 31928, "uy": 31929, "greate": 31930, "expresses": 31931, "dandy": 31932, "scapes": 31933, "survivor": 31934, "rond": 31935, "eci": 31936, "havin": 31937, "abel": 31938, "childish": 31939, "torque": 31940, "wavy": 31941, "urself": 31942, "kanyewest": 31943, "yearof": 31944, "alestine": 31945, "obrien": 31946, "alfon": 31947, "skag": 31948, "korean": 31949, "anchorage": 31950, "valeri": 31951, "dew": 31952, "ðŁİ¨": 31953, "landslide": 31954, "carole": 31955, "christen": 31956, "gophers": 31957, "afi": 31958, "priyanka": 31959, "qq": 31960, "powerof": 31961, "itte": 31962, "pcso": 31963, "twol": 31964, "pry": 31965, "intellectu": 31966, "guerrero": 31967, "piles": 31968, "wishlist": 31969, "wren": 31970, "timetable": 31971, "ëı": 31972, "prodigy": 31973, "gibbons": 31974, "./": 31975, "neur": 31976, "anzac": 31977, "murray": 31978, "viest": 31979, "plaster": 31980, "lair": 31981, "artgallery": 31982, "intercontinental": 31983, "gbr": 31984, "bellator": 31985, "namjoon": 31986, "mammals": 31987, "amel": 31988, "yaw": 31989, "sarasota": 31990, "camar": 31991, "budding": 31992, "summari": 31993, "acosta": 31994, "lash": 31995, "eyou": 31996, "postgraduate": 31997, "instructors": 31998, "tig": 31999, "constant": 32000, "werewolf": 32001, "icos": 32002, "clas": 32003, "glenn": 32004, "budge": 32005, "ðŁĻĤ": 32006, "erta": 32007, "stains": 32008, "persecution": 32009, "cumbri": 32010, "och": 32011, "synergy": 32012, "huang": 32013, "scandin": 32014, "midterms": 32015, "commentator": 32016, "regarded": 32017, "perpetual": 32018, "boiling": 32019, "alp": 32020, "lange": 32021, "schle": 32022, "faceli": 32023, "tweeta": 32024, "ridden": 32025, "oktoberfest": 32026, "charlottesville": 32027, "iklan": 32028, "jou": 32029, "chatham": 32030, "bsc": 32031, "ðŁį¦": 32032, "strauss": 32033, "mellow": 32034, "xxxx": 32035, "happyhour": 32036, "reactor": 32037, "wwer": 32038, "distraction": 32039, "atorial": 32040, "ðŁĴªðŁı¼": 32041, "twinpeaks": 32042, "fayette": 32043, "aor": 32044, "kok": 32045, "broom": 32046, "syfy": 32047, "ouse": 32048, "amag": 32049, "Ø·": 32050, "ubisoft": 32051, "lulu": 32052, "hallmark": 32053, "stuart": 32054, "itya": 32055, "sideline": 32056, "vengeance": 32057, "relu": 32058, "sexism": 32059, "bouncing": 32060, "unites": 32061, "gustav": 32062, "tessa": 32063, "stump": 32064, "proclamation": 32065, "imax": 32066, "dividend": 32067, "colby": 32068, "ðŁįİ": 32069, "playwright": 32070, "unsafe": 32071, "cosmo": 32072, "ðŁĩ²ðŁĩ½": 32073, "cupboard": 32074, "constituents": 32075, "anglia": 32076, "rampage": 32077, "ðŁĺįðŁĺįðŁĺįðŁĺįðŁĺį": 32078, "thanked": 32079, "takeaways": 32080, "shroff": 32081, "debat": 32082, "khur": 32083, "conducts": 32084, "formats": 32085, "à©": 32086, "portage": 32087, "graphers": 32088, "uten": 32089, "prem": 32090, "moines": 32091, "condemns": 32092, "sous": 32093, "lps": 32094, "fcs": 32095, "dealership": 32096, "leukemia": 32097, "bureau": 32098, "skid": 32099, "guardiola": 32100, "caster": 32101, "third": 32102, "avoided": 32103, "encyclo": 32104, "csr": 32105, "vixx": 32106, "analyzing": 32107, "shear": 32108, "duluth": 32109, "shapiro": 32110, "chanting": 32111, "stresses": 32112, "asbe": 32113, "militia": 32114, "ãĥª": 32115, "collin": 32116, "arsene": 32117, "suresh": 32118, "teachings": 32119, "yixing": 32120, "shill": 32121, "nudes": 32122, "svu": 32123, "clearwater": 32124, "warped": 32125, "prolife": 32126, "artistson": 32127, "itu": 32128, "versailles": 32129, "galaxy": 32130, "axel": 32131, "springst": 32132, "cala": 32133, "huhu": 
32134, "scu": 32135, "commitments": 32136, "exeter": 32137, "poignant": 32138, "motion": 32139, "conservatory": 32140, "rowdy": 32141, "recalled": 32142, "musk": 32143, "embelli": 32144, "sothe": 32145, "âĺĢ": 32146, "stopper": 32147, "schild": 32148, "tope": 32149, "elmo": 32150, "ziel": 32151, "jom": 32152, "barnsley": 32153, "snowden": 32154, "ontour": 32155, "journey": 32156, "hillsborough": 32157, "parole": 32158, "wts": 32159, "moving": 32160, "agility": 32161, "tivo": 32162, "ffers": 32163, "kindleunlimited": 32164, "gwen": 32165, "annan": 32166, "ahmad": 32167, "textured": 32168, "hepatitis": 32169, "dram": 32170, "insiders": 32171, "tissues": 32172, "ãĥĦ": 32173, "fcbarcelona": 32174, "cratic": 32175, "naacp": 32176, "pecan": 32177, "fgm": 32178, "customize": 32179, "concert": 32180, "gsm": 32181, "peg": 32182, "pone": 32183, "justintrudeau": 32184, "supercars": 32185, "happyholidays": 32186, "bular": 32187, "adox": 32188, "laptops": 32189, "digitalhealth": 32190, "destination": 32191, "gradually": 32192, "áĥ¦": 32193, "poppy": 32194, "ssl": 32195, "inhibit": 32196, "starlight": 32197, "offro": 32198, "gloomy": 32199, "xper": 32200, "halder": 32201, "implants": 32202, "leto": 32203, "hassel": 32204, "aas": 32205, "untold": 32206, "enci": 32207, "liberia": 32208, "oran": 32209, "contests": 32210, "ilah": 32211, "smag": 32212, "scout": 32213, "marianne": 32214, "cryo": 32215, "scheduling": 32216, "los": 32217, "kane": 32218, "stuttgart": 32219, "nese": 32220, "lawrence": 32221, "dain": 32222, "photom": 32223, "carou": 32224, "ร": 32225, "gwy": 32226, "nationaldogday": 32227, "roasting": 32228, "bandcamp": 32229, "kentucky": 32230, "stretches": 32231, "kerel": 32232, "cashe": 32233, "ãĤ¸": 32234, "stax": 32235, "transi": 32236, "doggie": 32237, "atric": 32238, "halle": 32239, "civic": 32240, "browning": 32241, "leinster": 32242, "catday": 32243, "highland": 32244, "joyous": 32245, "incumb": 32246, "orlando": 32247, "romo": 32248, "colton": 32249, "delta": 32250, "carab": 32251, "rotc": 32252, "asteroid": 32253, "goosebumps": 32254, "mology": 32255, "yoko": 32256, "ands": 32257, "tomorrows": 32258, "redcarpet": 32259, "smp": 32260, "casio": 32261, "ðŁ¤£ðŁ¤£ðŁ¤£": 32262, "seau": 32263, "rejection": 32264, "rotating": 32265, "bipartisan": 32266, "thun": 32267, "mati": 32268, "boni": 32269, "oll": 32270, "energye": 32271, "doit": 32272, "lj": 32273, "motherhood": 32274, "louise": 32275, "necklaces": 32276, "elite": 32277, "nix": 32278, "lcs": 32279, "env": 32280, "glu": 32281, "lesh": 32282, "crank": 32283, "susie": 32284, "mclau": 32285, "sotu": 32286, "crowley": 32287, "ratri": 32288, "used": 32289, "breton": 32290, "alfredo": 32291, "yeo": 32292, "travelpics": 32293, "tipp": 32294, "ellison": 32295, "saxophone": 32296, "mered": 32297, "heughan": 32298, "taine": 32299, "fes": 32300, "viro": 32301, "supposedly": 32302, "ias": 32303, "digestive": 32304, "yle": 32305, "lizzy": 32306, "wildlifephotography": 32307, "brianna": 32308, "westfield": 32309, "rained": 32310, "amher": 32311, "ðŁĺĦðŁĺĦ": 32312, "distribute": 32313, "bottom": 32314, "preserving": 32315, "oiland": 32316, "crafty": 32317, "descen": 32318, "colling": 32319, "shakespearesunday": 32320, "rwc": 32321, "angled": 32322, "cian": 32323, "tations": 32324, "montage": 32325, "meyers": 32326, "francesca": 32327, "ðŁĮ·": 32328, "wiggins": 32329, "sanford": 32330, "volunteer": 32331, "carra": 32332, "bark": 32333, "varied": 32334, "plin": 32335, "amu": 32336, "kapil": 32337, "rockers": 32338, "quind": 32339, "brane": 32340, 
"inmate": 32341, "ental": 32342, "improvis": 32343, "michigan": 32344, "retweeting": 32345, "progressing": 32346, "mercedesbenz": 32347, "smoker": 32348, "physiology": 32349, "dorado": 32350, "wattpad": 32351, "hwa": 32352, "srbachchan": 32353, "wga": 32354, "volatility": 32355, "hire": 32356, "acap": 32357, "wnba": 32358, "heinz": 32359, "stitches": 32360, "kidnapping": 32361, "burys": 32362, "limb": 32363, "fitters": 32364, "thumbnail": 32365, "tone": 32366, "mirand": 32367, "desirable": 32368, "addison": 32369, "taran": 32370, "tamilnadu": 32371, "spectator": 32372, "sociology": 32373, "amitshah": 32374, "remotely": 32375, "âĻ¦": 32376, "hamid": 32377, "rds": 32378, "glee": 32379, "smoothly": 32380, "schro": 32381, "erc": 32382, "laliga": 32383, "heals": 32384, "usf": 32385, "nishi": 32386, "dhu": 32387, "unil": 32388, "hle": 32389, "tromb": 32390, "bhutan": 32391, "pilipinas": 32392, "seung": 32393, "whitman": 32394, "tey": 32395, "mince": 32396, "snowboarding": 32397, "reau": 32398, "kker": 32399, "avo": 32400, "zachary": 32401, "ranveer": 32402, "tik": 32403, "govern": 32404, "qual": 32405, "becky": 32406, "anthropology": 32407, "atten": 32408, "groceries": 32409, "debit": 32410, "warp": 32411, "silicon": 32412, "hawaii": 32413, "ðŁĴħ": 32414, "pomegranate": 32415, "peer": 32416, "oranges": 32417, "peopleschoice": 32418, "endure": 32419, "ðŁĴĽðŁĴĽ": 32420, "ãĤ¹ãĥ": 32421, "acial": 32422, "ahaha": 32423, "stuk": 32424, "imperial": 32425, "blond": 32426, "powder": 32427, "knots": 32428, "vince": 32429, "woodlands": 32430, "dena": 32431, "watchin": 32432, "matcha": 32433, "mahat": 32434, "galaxies": 32435, "middlesbrough": 32436, "kö": 32437, "stree": 32438, "rescues": 32439, "waldo": 32440, "leroy": 32441, "despic": 32442, "realities": 32443, "tmnt": 32444, "haq": 32445, "uno": 32446, "pec": 32447, "bollywood": 32448, "blinds": 32449, "designthinking": 32450, "hems": 32451, "andhra": 32452, "absen": 32453, "fans": 32454, "stech": 32455, "shirehour": 32456, "blaine": 32457, "shakti": 32458, "purely": 32459, "ðŁıı": 32460, "trafal": 32461, "keynes": 32462, "grate": 32463, "tobias": 32464, "spontaneous": 32465, "saturated": 32466, "cavalry": 32467, "prisc": 32468, "ðŁĺij": 32469, "wht": 32470, "passi": 32471, "~~~": 32472, "virat": 32473, "pattinson": 32474, "lao": 32475, "weirdo": 32476, "sympathy": 32477, "juda": 32478, "occasionally": 32479, "credited": 32480, "statu": 32481, "esco": 32482, "hilly": 32483, "escape": 32484, "discharge": 32485, "seer": 32486, "maynard": 32487, "sudbury": 32488, "zlat": 32489, "oral": 32490, "weer": 32491, "encountered": 32492, "smelling": 32493, "oversight": 32494, "ê¸": 32495, "thatcher": 32496, "mackay": 32497, "youcan": 32498, "freep": 32499, "freedoms": 32500, "prophecy": 32501, "hoe": 32502, "ishqba": 32503, "drake": 32504, "quits": 32505, "pelled": 32506, "turk": 32507, "ovi": 32508, "wesleyan": 32509, "newmusic": 32510, "legg": 32511, "cheng": 32512, "hilli": 32513, "ayy": 32514, "panties": 32515, "adversity": 32516, "adjac": 32517, "vaccination": 32518, "juke": 32519, "gac": 32520, "exceed": 32521, "timesof": 32522, "staining": 32523, "epcot": 32524, "vital": 32525, "upward": 32526, "bethesda": 32527, "apark": 32528, "mahi": 32529, "campfire": 32530, "enchanting": 32531, "rhapso": 32532, "hz": 32533, "naver": 32534, "fax": 32535, "validation": 32536, "acad": 32537, "nyr": 32538, "asym": 32539, "coordinated": 32540, "departed": 32541, "allery": 32542, "varies": 32543, "sprite": 32544, "chaplin": 32545, "ssoccer": 32546, "swat": 32547, "bret": 
32548, "reluct": 32549, "tunesapp": 32550, "superstar": 32551, "reminiscing": 32552, "oco": 32553, "homegrown": 32554, "doughnut": 32555, "uncanny": 32556, "lapd": 32557, "thyroid": 32558, "!âĿ¤ï¸ı": 32559, "botanic": 32560, "bres": 32561, "spade": 32562, "iste": 32563, "echoes": 32564, "dulil": 32565, "bursting": 32566, "quiero": 32567, "ðŁijİ": 32568, "loyola": 32569, "amusement": 32570, "hails": 32571, "sleepy": 32572, "burglary": 32573, "âľı": 32574, "rogue": 32575, "cotland": 32576, "moors": 32577, "lower": 32578, "wicked": 32579, "ðŁĶĬ": 32580, "competiti": 32581, "argentine": 32582, "yvonne": 32583, "kartikeyan": 32584, "iliary": 32585, "gatsby": 32586, "precinct": 32587, "sixty": 32588, "naji": 32589, "cams": 32590, "practitioner": 32591, "ðŁĺ³ðŁĺ³": 32592, "pune": 32593, "negli": 32594, "julien": 32595, "invaded": 32596, "calibr": 32597, "clam": 32598, "dubai": 32599, "muk": 32600, "lantic": 32601, "product": 32602, "fedex": 32603, "ï¸ı:": 32604, "eura": 32605, "darius": 32606, "sling": 32607, "virtualreality": 32608, "homestead": 32609, "ðŁı³ï¸ıâĢįðŁĮĪ": 32610, "paced": 32611, "inha": 32612, "pulmon": 32613, "lazy": 32614, "premiering": 32615, "mastered": 32616, "inhe": 32617, "congregation": 32618, "bajo": 32619, "sporting": 32620, "newjersey": 32621, "horny": 32622, "lmaoo": 32623, "lengthy": 32624, "dut": 32625, "yogh": 32626, "swearing": 32627, "philosophical": 32628, "papua": 32629, "inski": 32630, "knowles": 32631, "dyke": 32632, "âĢ²": 32633, "token": 32634, "mcguire": 32635, "riot": 32636, "probability": 32637, "mccon": 32638, "gros": 32639, "sumat": 32640, "cite": 32641, "daa": 32642, "onda": 32643, "maddow": 32644, "chew": 32645, "boardgames": 32646, "sparked": 32647, "reclaimed": 32648, "adhd": 32649, "nyse": 32650, "imwithher": 32651, "equinox": 32652, "booths": 32653, "balsamic": 32654, "hazy": 32655, "dorchester": 32656, "agos": 32657, "seaw": 32658, "moderator": 32659, "seriea": 32660, "andersen": 32661, "pilgrim": 32662, "âŃIJâŃIJ": 32663, "itchen": 32664, "halli": 32665, "xton": 32666, "nathaniel": 32667, "munition": 32668, "celestial": 32669, "gaf": 32670, "zoom": 32671, "markle": 32672, "penthouse": 32673, "cale": 32674, "sfa": 32675, "barking": 32676, "tucket": 32677, "emery": 32678, "calorie": 32679, "lique": 32680, "adar": 32681, "mcnam": 32682, "tortilla": 32683, "woodpecker": 32684, "motown": 32685, "badger": 32686, "ayrshire": 32687, "scramble": 32688, "dday": 32689, "craziest": 32690, "perrie": 32691, "choco": 32692, "caste": 32693, "iot": 32694, "wrecked": 32695, "selecting": 32696, "ussr": 32697, "graft": 32698, "punt": 32699, "labou": 32700, "irst": 32701, "baek": 32702, "ÛĮ": 32703, "suki": 32704, "queu": 32705, "achat": 32706, "tester": 32707, "augmented": 32708, "wcvb": 32709, "sinks": 32710, "ðŁĵ»": 32711, "rake": 32712, "interne": 32713, "because": 32714, "bellevue": 32715, "unearth": 32716, "lighten": 32717, "ðŁĺ£": 32718, "turnaround": 32719, "labeled": 32720, "unemployed": 32721, "twitterkurds": 32722, "leia": 32723, "hye": 32724, "greater": 32725, "ðŁIJİ": 32726, "timed": 32727, "ired": 32728, "ett": 32729, "limitations": 32730, "cabe": 32731, "sout": 32732, "beech": 32733, "annihil": 32734, "retrac": 32735, "yoona": 32736, "anger": 32737, "dennis": 32738, "supplying": 32739, "diz": 32740, "\"(": 32741, "scur": 32742, "gunman": 32743, "suho": 32744, "sauvignon": 32745, "ล": 32746, "wiley": 32747, "landon": 32748, "choreography": 32749, "prehistoric": 32750, "ðŁıĥ": 32751, "vargas": 32752, "assessments": 32753, "pinnacle": 32754, "dii": 32755, 
"chamberlain": 32756, "ìĪ": 32757, "vp": 32758, "presenters": 32759, "deutsche": 32760, "sunshine": 32761, "salutes": 32762, "rone": 32763, "busiest": 32764, "-.-": 32765, "motorists": 32766, "hemisphere": 32767, "alwx": 32768, "psp": 32769, "owa": 32770, "denying": 32771, "choc": 32772, "gutier": 32773, "hanuk": 32774, "muskete": 32775, "jaitley": 32776, "sewage": 32777, "tame": 32778, "thinkers": 32779, "shim": 32780, "sequo": 32781, "papar": 32782, "middleeast": 32783, "kwa": 32784, "keg": 32785, "patagonia": 32786, "noy": 32787, "barça": 32788, "takeoff": 32789, "hea": 32790, "à¬": 32791, "nsc": 32792, "gdc": 32793, "ðŁijĪ": 32794, "moustache": 32795, "melania": 32796, "thra": 32797, "â¬Ĩï¸ı": 32798, "pierced": 32799, "zeus": 32800, "fonts": 32801, "bera": 32802, "itiner": 32803, "qatar": 32804, "contrary": 32805, "ireland": 32806, "ify": 32807, "oulos": 32808, "communal": 32809, "fins": 32810, "unpaid": 32811, "paa": 32812, "ðŁijĩðŁı»": 32813, "rios": 32814, "oup": 32815, "filler": 32816, "cafeteria": 32817, "à¸Ń": 32818, "kasi": 32819, "caliber": 32820, "zulu": 32821, "vsco": 32822, "tsford": 32823, "dragonfly": 32824, "smokin": 32825, "pist": 32826, "psychologist": 32827, "diplomat": 32828, "webs": 32829, "buccane": 32830, "ா": 32831, "motivational": 32832, "dune": 32833, "bae": 32834, "cfs": 32835, "without": 32836, "eron": 32837, "iac": 32838, "atee": 32839, "pension": 32840, "frazier": 32841, "ensis": 32842, "skis": 32843, "parting": 32844, "gery": 32845, "territories": 32846, "nachos": 32847, "enight": 32848, "everlasting": 32849, "msdhoni": 32850, "tele": 32851, "spun": 32852, "podi": 32853, "sabah": 32854, "environmentally": 32855, "cease": 32856, "beaumont": 32857, "marta": 32858, "kelvin": 32859, "hoff": 32860, "sunil": 32861, "nda": 32862, "cob": 32863, "shale": 32864, "reedus": 32865, "unboxing": 32866, "ubio": 32867, "reopened": 32868, "nall": 32869, "capsules": 32870, "marr": 32871, "himalayas": 32872, "sweeter": 32873, "jaz": 32874, "fmr": 32875, "tweeter": 32876, "dhaka": 32877, "nau": 32878, "demi": 32879, "dfs": 32880, "taurus": 32881, "fading": 32882, "itutes": 32883, "cip": 32884, "overflow": 32885, "jeffrey": 32886, "donny": 32887, "cartunesapp": 32888, "ðŁįij": 32889, "prefecture": 32890, "danced": 32891, "cpt": 32892, "pleasing": 32893, "italk": 32894, "earthquakes": 32895, "ulation": 32896, "hio": 32897, "ãĢĭ": 32898, "antan": 32899, "nutrient": 32900, "deere": 32901, "selects": 32902, "enrichment": 32903, "riti": 32904, "trampol": 32905, "blamed": 32906, "jia": 32907, "contributors": 32908, "chesapeake": 32909, "pigeons": 32910, "tribunal": 32911, "maduro": 32912, "wsu": 32913, "ilove": 32914, "efficiently": 32915, "darcy": 32916, "warms": 32917, "arra": 32918, "ecu": 32919, "hower": 32920, "struggled": 32921, "rajinikanth": 32922, "ðŁĺ¢ðŁĺ¢": 32923, "housing": 32924, "strat": 32925, "elix": 32926, "dispro": 32927, "raffic": 32928, "thierry": 32929, "nasty": 32930, "cfb": 32931, "staffing": 32932, "alma": 32933, "backers": 32934, "henson": 32935, "skywalker": 32936, "realestate": 32937, "roos": 32938, "nessy": 32939, "chance": 32940, "cairns": 32941, "cci": 32942, "pedal": 32943, "lyft": 32944, "crossword": 32945, "waiter": 32946, "onlyin": 32947, "kruger": 32948, "kir": 32949, "alejandro": 32950, "cartier": 32951, "carrera": 32952, "repaired": 32953, "ouat": 32954, "unclear": 32955, "unbreakable": 32956, "todayin": 32957, "queries": 32958, "jody": 32959, "genital": 32960, "winner": 32961, "tol": 32962, "kelowna": 32963, "fascinated": 32964, "ãĥ¬": 32965, 
"srisri": 32966, "squared": 32967, "sprung": 32968, "negotiate": 32969, "privately": 32970, "aven": 32971, ">>>>>": 32972, "gical": 32973, "gavin": 32974, "chesterfield": 32975, "zumba": 32976, "orr": 32977, "natalia": 32978, "impeachment": 32979, "mnl": 32980, "carat": 32981, "critique": 32982, "credible": 32983, "tracy": 32984, "tani": 32985, "musik": 32986, "jigsaw": 32987, "gambia": 32988, "tolkien": 32989, "feu": 32990, "asper": 32991, "savory": 32992, "foxx": 32993, "fitt": 32994, "marlon": 32995, "lrt": 32996, "vell": 32997, "pbr": 32998, "imprisoned": 32999, "iom": 33000, "chul": 33001, "windshield": 33002, "kaye": 33003, "baa": 33004, "chord": 33005, "sart": 33006, "algon": 33007, "ministerial": 33008, "natgeo": 33009, "lazio": 33010, "norms": 33011, "ðŁijįðŁijį": 33012, "licking": 33013, "futbol": 33014, "unsung": 33015, "dallascowboys": 33016, "shred": 33017, "disturb": 33018, "devine": 33019, "beards": 33020, "chf": 33021, "bday": 33022, "rosso": 33023, "igor": 33024, "ayi": 33025, "siren": 33026, "kair": 33027, "stiles": 33028, "rof": 33029, "magnets": 33030, "uncover": 33031, "mouse": 33032, "banging": 33033, "sighted": 33034, "speople": 33035, "impact": 33036, "rowland": 33037, "kira": 33038, "environment": 33039, "lovethe": 33040, "psis": 33041, "mishra": 33042, "glendale": 33043, "cajun": 33044, "oche": 33045, "deception": 33046, "sexist": 33047, "straws": 33048, "sga": 33049, "buffer": 33050, "apostle": 33051, "spl": 33052, "popup": 33053, "ðŁļĹ": 33054, "rg": 33055, "uper": 33056, "ballin": 33057, "idy": 33058, "occasional": 33059, "nationalpark": 33060, "ðŁıĬ": 33061, "uan": 33062, "innovation": 33063, "ห": 33064, "teaparty": 33065, "rette": 33066, "counterfe": 33067, "bha": 33068, "recs": 33069, "igen": 33070, "ðŁĮIJ": 33071, "hummingbird": 33072, "cur": 33073, "haven": 33074, "lazar": 33075, "pueblo": 33076, "::": 33077, "zionist": 33078, "opath": 33079, "inverness": 33080, "promoter": 33081, "cartoon": 33082, "cabinets": 33083, "mahogany": 33084, "surveying": 33085, "rational": 33086, "feeling": 33087, "testify": 33088, "sow": 33089, "ocon": 33090, "ย": 33091, "neel": 33092, "maris": 33093, "solitary": 33094, "chemo": 33095, "radcliffe": 33096, "simons": 33097, "rosary": 33098, "newer": 33099, "jodie": 33100, "retali": 33101, "prawn": 33102, "paddy": 33103, "henge": 33104, "kala": 33105, "implant": 33106, "aty": 33107, "brentwood": 33108, "paradox": 33109, "enez": 33110, "redesigned": 33111, "pour": 33112, "wyd": 33113, "alde": 33114, "à¯ģ": 33115, "sold": 33116, "biomedical": 33117, "à¹Ĥ": 33118, "tttt": 33119, "matteo": 33120, "yser": 33121, "newton": 33122, "debun": 33123, "nerdy": 33124, "lool": 33125, "woon": 33126, "elisabeth": 33127, "ecc": 33128, "whi": 33129, "acho": 33130, "salvage": 33131, "salaries": 33132, "quity": 33133, "navigating": 33134, "ophthal": 33135, "consoles": 33136, "rebuilt": 33137, "opec": 33138, "asters": 33139, "shored": 33140, "setlist": 33141, "kathryn": 33142, "rhymes": 33143, "revisiting": 33144, "ashish": 33145, "lift": 33146, "repost": 33147, "soleil": 33148, "âı±": 33149, "wealth": 33150, "saat": 33151, "wec": 33152, "kingjames": 33153, "flipkart": 33154, "fieldwork": 33155, "segu": 33156, "modal": 33157, "bub": 33158, "arers": 33159, "ðŁįĴ": 33160, "clooney": 33161, "paddington": 33162, "necessity": 33163, "guthrie": 33164, "pente": 33165, "limo": 33166, "josie": 33167, "artin": 33168, "enc": 33169, "lhs": 33170, "betrayal": 33171, "infographics": 33172, "ier": 33173, "moa": 33174, "hearings": 33175, "bonjour": 33176, "symbolic": 
33177, "agro": 33178, "wedges": 33179, "kristina": 33180, "wildflower": 33181, "athletic": 33182, "photography": 33183, "pesh": 33184, "cahill": 33185, "chilean": 33186, "goul": 33187, "fioren": 33188, "ðŁij¶": 33189, "zil": 33190, "skim": 33191, "badoo": 33192, "delia": 33193, "treble": 33194, "ncc": 33195, "ðŁĩ¦ðŁĩ": 33196, "ahouse": 33197, "bullock": 33198, "solitude": 33199, "اÙĨ": 33200, "cancers": 33201, "futureofwork": 33202, "hutch": 33203, "watershed": 33204, "warmongers": 33205, "spilled": 33206, "colombo": 33207, "moth": 33208, "associations": 33209, "weighed": 33210, "globalgoals": 33211, "notjust": 33212, "christi": 33213, "torg": 33214, "sweating": 33215, "maneu": 33216, "clusters": 33217, "âĢ¼ï¸ıâĢ¼ï¸ı": 33218, "taped": 33219, "uly": 33220, "trusting": 33221, "yusuf": 33222, "tein": 33223, "rab": 33224, ",,,,": 33225, "sinai": 33226, "audible": 33227, "explicit": 33228, "crowns": 33229, "schiz": 33230, "atleast": 33231, "ðŁĹ£": 33232, "debra": 33233, "jesuit": 33234, "enegger": 33235, "zhen": 33236, "onesie": 33237, "iit": 33238, "ssf": 33239, "gurgaon": 33240, "chakra": 33241, "bearcats": 33242, "kran": 33243, "kawa": 33244, "requesting": 33245, "hanover": 33246, "gend": 33247, "soros": 33248, "mercy": 33249, "lovely": 33250, "doomed": 33251, "timmy": 33252, "kuz": 33253, "ull": 33254, "abram": 33255, "saison": 33256, "ãĥ«": 33257, "cleaners": 33258, "remo": 33259, "circuits": 33260, "barred": 33261, "oth": 33262, "moist": 33263, "madeleine": 33264, "gallo": 33265, "uj": 33266, "permits": 33267, "heaviest": 33268, "carols": 33269, "azte": 33270, "giorgio": 33271, "floats": 33272, "declaring": 33273, "usrc": 33274, "minat": 33275, "crafts": 33276, "prima": 33277, "conveni": 33278, "nickelodeon": 33279, "dancing": 33280, "ceremonial": 33281, "blogg": 33282, "twp": 33283, "anglican": 33284, "shek": 33285, "knick": 33286, "(((": 33287, "hubbard": 33288, "harvey": 33289, "hitman": 33290, "feng": 33291, "wesome": 33292, "forza": 33293, "sword": 33294, "opus": 33295, "brom": 33296, "gibility": 33297, "zal": 33298, "munch": 33299, "dancehall": 33300, "greedy": 33301, "hdmi": 33302, "rebirth": 33303, "ðŁĺĭðŁĺĭ": 33304, "sworld": 33305, "figurine": 33306, "compost": 33307, "kf": 33308, "engraving": 33309, "giorno": 33310, "stana": 33311, "kman": 33312, "hamster": 33313, "composers": 33314, "aje": 33315, "functionality": 33316, "polk": 33317, "isons": 33318, "airplanes": 33319, "tese": 33320, "horrors": 33321, "muscat": 33322, "given": 33323, "spence": 33324, "ðŁĩ¸ðŁĩ": 33325, "eliot": 33326, "achilles": 33327, "freck": 33328, "cryptocurrencies": 33329, "souther": 33330, "halo": 33331, "borneo": 33332, "politic": 33333, "hahahahah": 33334, "upstate": 33335, "siena": 33336, "obscure": 33337, "hausen": 33338, "lloyd": 33339, "happyfriday": 33340, "motorbike": 33341, "bona": 33342, "americas": 33343, "hols": 33344, "-(": 33345, "sporty": 33346, "unaware": 33347, "revenues": 33348, "christopher": 33349, "banksy": 33350, "avan": 33351, "evapor": 33352, "compress": 33353, "eyeliner": 33354, "todos": 33355, "buffy": 33356, "renewableenergy": 33357, "lyrical": 33358, "archan": 33359, "rapist": 33360, "fairtrade": 33361, "lmaooo": 33362, "beatz": 33363, "proactive": 33364, "lapse": 33365, "irical": 33366, "reversal": 33367, "pode": 33368, "mcintyre": 33369, "macau": 33370, "ãĥķãĤ": 33371, "nashgrier": 33372, "fsa": 33373, "gall": 33374, "çĶŁ": 33375, "perpetr": 33376, "ilya": 33377, "configuration": 33378, "%;": 33379, "strange": 33380, "raci": 33381, "à¸ĩ": 33382, "pickups": 33383, "kovsky": 
33384, "mammal": 33385, "wps": 33386, "gable": 33387, "comparative": 33388, "zh": 33389, "saveour": 33390, "davey": 33391, "onetsy": 33392, "mussels": 33393, "miser": 33394, "cristina": 33395, "electron": 33396, "crave": 33397, "loren": 33398, "precipitation": 33399, "mz": 33400, "ðŁį«": 33401, "vincen": 33402, "snowboard": 33403, "noida": 33404, "ahn": 33405, "marinated": 33406, "gtr": 33407, "townhall": 33408, "minis": 33409, "bethel": 33410, "advan": 33411, "sura": 33412, "shiel": 33413, "furry": 33414, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 33415, "lynd": 33416, "soil": 33417, "scence": 33418, "seneca": 33419, "sharjah": 33420, "dickens": 33421, "credentials": 33422, "avar": 33423, "perk": 33424, "requiring": 33425, "prefer": 33426, "jian": 33427, "deca": 33428, "rach": 33429, "ingfor": 33430, "dele": 33431, "beep": 33432, "ðŁĴ»": 33433, "cisely": 33434, "huddle": 33435, "greensboro": 33436, "hawking": 33437, "hoax": 33438, "hangar": 33439, "çľ": 33440, "miso": 33441, "lovin": 33442, "greta": 33443, "abad": 33444, "logie": 33445, "atan": 33446, "snowflake": 33447, "mahesh": 33448, "fearthe": 33449, "alkal": 33450, "bobblehead": 33451, "bahn": 33452, "judged": 33453, "futu": 33454, "felix": 33455, "ðŁįĵ": 33456, "pike": 33457, "deriv": 33458, "notices": 33459, "auer": 33460, "dissuper": 33461, "orda": 33462, "wipes": 33463, "amino": 33464, "strikers": 33465, "footb": 33466, "dramas": 33467, "punching": 33468, "scoreless": 33469, "hemingway": 33470, "bih": 33471, "ballad": 33472, "chatter": 33473, "ammo": 33474, "klein": 33475, "fabrication": 33476, "karim": 33477, "zend": 33478, "histo": 33479, "volta": 33480, "rocky": 33481, "marketer": 33482, "xtreme": 33483, "sequencing": 33484, "paradigm": 33485, "cleats": 33486, "booming": 33487, "âģłâģł": 33488, "blockade": 33489, "prompts": 33490, "yoghurt": 33491, "purpose": 33492, "nur": 33493, "regulate": 33494, "noisy": 33495, "ingrid": 33496, "birdwatching": 33497, "bartender": 33498, "Ùĥ": 33499, "wordof": 33500, "chaotic": 33501, "shorty": 33502, "eldest": 33503, "zapp": 33504, "onceuponatime": 33505, "flyo": 33506, "ritos": 33507, "mikequind": 33508, "ðŁIJ´": 33509, "registering": 33510, ".]": 33511, "adol": 33512, "gggg": 33513, "purge": 33514, "kidlit": 33515, "arbor": 33516, "valves": 33517, "synagogue": 33518, "oth": 33519, "unanimous": 33520, "verification": 33521, "darrell": 33522, "ãģĦ": 33523, "vanderbilt": 33524, "tapestry": 33525, "prosper": 33526, "diddy": 33527, "drafting": 33528, "decep": 33529, "marquis": 33530, "stint": 33531, "michaeljackson": 33532, "peeled": 33533, "menus": 33534, "bbb": 33535, "scare": 33536, "email": 33537, "wrigley": 33538, "itis": 33539, "fell": 33540, "somethin": 33541, "barra": 33542, "edgar": 33543, "dipping": 33544, "puddle": 33545, "slade": 33546, "learner": 33547, "jalen": 33548, "ðŁ§IJ": 33549, "thedaily": 33550, "mikequindazzi": 33551, "jux": 33552, "iqbal": 33553, "mckinney": 33554, "raiser": 33555, "efan": 33556, "drone": 33557, "cato": 33558, "picket": 33559, "crowe": 33560, "latt": 33561, "uko": 33562, "giuseppe": 33563, "hini": 33564, "synthesi": 33565, "pontifex": 33566, "songwriting": 33567, "tod": 33568, "switches": 33569, "dinners": 33570, "hq": 33571, "gabrielle": 33572, "pensacola": 33573, "circle": 33574, "exposes": 33575, "evs": 33576, "riyadh": 33577, "promen": 33578, "ock": 33579, "saj": 33580, "citation": 33581, "brewco": 33582, "josi": 33583, "epaper": 33584, "drif": 33585, "pointless": 33586, "tangled": 33587, "cripp": 33588, "lineups": 33589, "fairies": 33590, "daze": 33591, "mourn": 
33592, "bladder": 33593, "salz": 33594, "burundi": 33595, "bookmark": 33596, "thepeople": 33597, "subsequ": 33598, "principal": 33599, "sker": 33600, "courtney": 33601, "aoki": 33602, "racers": 33603, "adm": 33604, "moma": 33605, "criticalrole": 33606, "houn": 33607, "shedding": 33608, "saka": 33609, "aceous": 33610, "mckay": 33611, "husbands": 33612, "½": 33613, "meda": 33614, "accusations": 33615, "rosel": 33616, "ncis": 33617, "witnessing": 33618, "orama": 33619, "gods": 33620, "hilton": 33621, "elman": 33622, "ÃŃn": 33623, "megap": 33624, "craven": 33625, "announcer": 33626, "criteri": 33627, "sheffieldissuper": 33628, "militant": 33629, "consul": 33630, "hooded": 33631, "abyss": 33632, "bx": 33633, "madam": 33634, "locu": 33635, "maryam": 33636, "manicure": 33637, "gratis": 33638, "actresses": 33639, "rosario": 33640, "thisdayin": 33641, "kingly": 33642, "gnome": 33643, "celine": 33644, "rous": 33645, "heel": 33646, "lilac": 33647, "vishal": 33648, "abh": 33649, "thorns": 33650, "sls": 33651, "neal": 33652, "constructing": 33653, "beren": 33654, "slang": 33655, "mains": 33656, "farra": 33657, "sarko": 33658, "paige": 33659, "guiller": 33660, "lala": 33661, "iceberg": 33662, "noun": 33663, "planners": 33664, "ummm": 33665, "ouses": 33666, "illary": 33667, "maan": 33668, "boxing": 33669, "zipper": 33670, "srinagar": 33671, "miguel": 33672, "ostr": 33673, "mpo": 33674, "responsibly": 33675, "lanterns": 33676, "appliance": 33677, "xb": 33678, "grenade": 33679, "neglect": 33680, "dysle": 33681, "hammock": 33682, "nectar": 33683, "witcher": 33684, "rgv": 33685, "dience": 33686, "serbian": 33687, "seeded": 33688, "cruz": 33689, "bish": 33690, "sphe": 33691, "eq": 33692, "skyrim": 33693, "algebra": 33694, "philately": 33695, "bungalow": 33696, "geoff": 33697, "yves": 33698, "demanded": 33699, "considerations": 33700, "thevamp": 33701, "pawankalyan": 33702, "coded": 33703, "gritty": 33704, "eruption": 33705, "seinfeld": 33706, "unidenti": 33707, "ëĭĪ": 33708, "worm": 33709, "acus": 33710, "seung": 33711, "dung": 33712, "roland": 33713, "sud": 33714, "divisions": 33715, "ablanc": 33716, "shortest": 33717, "jf": 33718, "poun": 33719, "plantbased": 33720, "beto": 33721, "tougher": 33722, "mco": 33723, "donet": 33724, "markus": 33725, "vfl": 33726, "ðŁıł": 33727, "opening": 33728, "coward": 33729, "cabernet": 33730, "oxi": 33731, "burlesque": 33732, "sandra": 33733, "sumo": 33734, "consist": 33735, "thot": 33736, "cayman": 33737, "motorola": 33738, "gutierrez": 33739, "dslr": 33740, "yw": 33741, "nobel": 33742, "novice": 33743, "momsdemand": 33744, "grunge": 33745, "spor": 33746, "dcc": 33747, "presses": 33748, "slist": 33749, "allotment": 33750, "vocational": 33751, "ftc": 33752, "puja": 33753, "loven": 33754, "uttarak": 33755, "tandem": 33756, "shep": 33757, "comedians": 33758, "anatom": 33759, "cantwait": 33760, "healthyeating": 33761, "westside": 33762, "margins": 33763, "chiang": 33764, "asbestos": 33765, "stupidity": 33766, "problematic": 33767, "fitbit": 33768, ":$": 33769, "ceilings": 33770, "shua": 33771, "protections": 33772, "biotic": 33773, "bengali": 33774, "rests": 33775, "biennale": 33776, "timo": 33777, "culmin": 33778, "eminent": 33779, "affection": 33780, "unbelievably": 33781, "individually": 33782, "canvassing": 33783, "whitt": 33784, "novasco": 33785, "chinson": 33786, "hpe": 33787, "gow": 33788, "gloucestershire": 33789, "pao": 33790, "threshold": 33791, "chevron": 33792, "sine": 33793, "wether": 33794, "ppie": 33795, "aquino": 33796, "antwerp": 33797, "âĸ¬": 33798, "poon": 
33799, "instaf": 33800, "equine": 33801, "cinematography": 33802, "nbafinals": 33803, "valiant": 33804, "kilkenny": 33805, "terence": 33806, "systemic": 33807, "srl": 33808, "pound": 33809, "madeira": 33810, "plough": 33811, "trecht": 33812, "mated": 33813, "mpd": 33814, "ransomware": 33815, "phin": 33816, "liqui": 33817, "bbce": 33818, "boomer": 33819, "istandwith": 33820, "conju": 33821, "rte": 33822, "nara": 33823, "foolish": 33824, "dashing": 33825, "viernes": 33826, "brite": 33827, "dau": 33828, "juniper": 33829, "aida": 33830, "younow": 33831, "razer": 33832, "dei": 33833, "repeating": 33834, "comforting": 33835, "adjacent": 33836, "eto": 33837, "casted": 33838, "chatur": 33839, "muer": 33840, "synth": 33841, "sanitary": 33842, "macle": 33843, "independent": 33844, "lawful": 33845, "eerie": 33846, "hor": 33847, "ðŁĴŃ": 33848, "amrit": 33849, "velo": 33850, "stationery": 33851, "muf": 33852, "maymay": 33853, "contemplating": 33854, "elaborate": 33855, "gregor": 33856, "dries": 33857, "accol": 33858, "à¸ļ": 33859, "schwarzenegger": 33860, "illnesses": 33861, "daybreak": 33862, "followback": 33863, "collusion": 33864, "electronic": 33865, "jovi": 33866, "hiroshima": 33867, "taw": 33868, "homec": 33869, "micah": 33870, "quitting": 33871, "frosting": 33872, "benfica": 33873, "heli": 33874, "sical": 33875, "piccad": 33876, "corporate": 33877, "mentorship": 33878, "youare": 33879, "singer": 33880, "shiva": 33881, "rune": 33882, "inger": 33883, "rium": 33884, "playable": 33885, "doop": 33886, "willow": 33887, "terre": 33888, "nip": 33889, "atd": 33890, "warbler": 33891, "professionally": 33892, "erase": 33893, "proceed": 33894, "pedestrians": 33895, "mischief": 33896, "bending": 33897, "alaskan": 33898, "ckett": 33899, "mop": 33900, "ddles": 33901, "shutter": 33902, "geared": 33903, "ateneo": 33904, "madeline": 33905, "gations": 33906, "osha": 33907, "derick": 33908, "swild": 33909, "angry": 33910, "patents": 33911, "hunk": 33912, "decreased": 33913, "fry": 33914, "ðŁĴĸðŁĴĸðŁĴĸ": 33915, "salon": 33916, "quantities": 33917, "dario": 33918, "nigel": 33919, "kuma": 33920, "jenn": 33921, "happye": 33922, "xxx": 33923, "rexperience": 33924, "pros": 33925, "ausch": 33926, "relessly": 33927, "hamburger": 33928, "fukushima": 33929, "erne": 33930, "statec": 33931, "rend": 33932, "mayfield": 33933, "jone": 33934, "lefty": 33935, "bernstein": 33936, "smil": 33937, "generates": 33938, "forestation": 33939, "bandits": 33940, "tayo": 33941, "rca": 33942, "acci": 33943, "rodrigo": 33944, "knapp": 33945, "elovers": 33946, "vegetation": 33947, "ural": 33948, "left": 33949, "ħï¸ı": 33950, "worldre": 33951, "suri": 33952, "embark": 33953, "wson": 33954, "bayou": 33955, "muller": 33956, "movers": 33957, "ðŁķº": 33958, "presbyter": 33959, "lf": 33960, "cree": 33961, "batb": 33962, "salam": 33963, "demonstrations": 33964, "anec": 33965, "npc": 33966, "itics": 33967, "tography": 33968, "reinst": 33969, "thurst": 33970, "tale": 33971, "offences": 33972, "smartcity": 33973, "brotha": 33974, "oftheyear": 33975, "invaluable": 33976, "earn": 33977, "ðŁijıðŁı½": 33978, "kremlin": 33979, "grady": 33980, "townfc": 33981, "guernsey": 33982, "maha": 33983, "contagious": 33984, "drex": 33985, "been": 33986, "(£": 33987, "nativity": 33988, "ktm": 33989, "somerhalder": 33990, "compounds": 33991, "íķĺ": 33992, "\"âĢ¦": 33993, "afg": 33994, "ottnews": 33995, "hound": 33996, "firefly": 33997, "cilan": 33998, "donetsk": 33999, "volunteered": 34000, "akira": 34001, "èª": 34002, "singul": 34003, "sth": 34004, "drowned": 34005, 
"mando": 34006, "heir": 34007, "ðŁİīðŁİĪ": 34008, "taxis": 34009, "yuki": 34010, "veld": 34011, "kans": 34012, "elk": 34013, "rants": 34014, "hashtag": 34015, "teng": 34016, "rog": 34017, "aat": 34018, "grub": 34019, "eber": 34020, "inindia": 34021, "colossus": 34022, "signi": 34023, "soever": 34024, "milestones": 34025, "dero": 34026, "differential": 34027, "phuket": 34028, "mastermind": 34029, "angh": 34030, "melani": 34031, "broker": 34032, "actorvijay": 34033, "stunned": 34034, "continuity": 34035, "affl": 34036, "vocal": 34037, "perennial": 34038, "fiancé": 34039, "incomplete": 34040, "hunts": 34041, "reissue": 34042, "dominates": 34043, "turmeric": 34044, "roam": 34045, "rion": 34046, "bagged": 34047, "nassau": 34048, "fut": 34049, "xox": 34050, "nationaltrust": 34051, "joye": 34052, "sano": 34053, "hearthstone": 34054, "disrespect": 34055, "lees": 34056, "hse": 34057, "siberian": 34058, "offee": 34059, "restock": 34060, "wolfgang": 34061, "regan": 34062, "plano": 34063, "unwind": 34064, "repar": 34065, "mille": 34066, "],": 34067, "skull": 34068, "fatally": 34069, "conceptual": 34070, "ðŁĮ²": 34071, "fé": 34072, "berto": 34073, "bms": 34074, "ua": 34075, "magna": 34076, "notredame": 34077, "lete": 34078, "laundering": 34079, "heartwarming": 34080, "buffett": 34081, "goat": 34082, "peabo": 34083, "windmill": 34084, "vac": 34085, "continually": 34086, "azalea": 34087, "membrane": 34088, "cancels": 34089, "makeyourown": 34090, "athered": 34091, "pto": 34092, "torpe": 34093, "ðŁĺł": 34094, "ðŁĴ§": 34095, "scares": 34096, "leaking": 34097, "zet": 34098, "pixels": 34099, "aci": 34100, "khil": 34101, "marathi": 34102, "ðŁĻıðŁı½": 34103, "ula": 34104, "tamu": 34105, "chandigarh": 34106, "zagre": 34107, "aab": 34108, "pronounced": 34109, "aubrey": 34110, "sander": 34111, "punta": 34112, "harlow": 34113, "icelan": 34114, "celebratory": 34115, "sot": 34116, "unciation": 34117, "struly": 34118, "mcdowell": 34119, "deepika": 34120, "reminders": 34121, "mystical": 34122, "ctc": 34123, "chatted": 34124, "sica": 34125, "bargains": 34126, "chhat": 34127, "rubin": 34128, "mnet": 34129, "oilandgas": 34130, "pelican": 34131, "oat": 34132, "morality": 34133, "kour": 34134, "ih": 34135, "nuclear": 34136, "gcu": 34137, "richer": 34138, "venezia": 34139, "mma": 34140, "leith": 34141, "accompany": 34142, "richmond": 34143, "sportsnet": 34144, "baahu": 34145, "smuggling": 34146, "mmi": 34147, "ðŁĩ®ðŁĩª": 34148, "twists": 34149, "sahib": 34150, ".....": 34151, "ambitions": 34152, "illo": 34153, "historical": 34154, "forec": 34155, "showbiz": 34156, "ponies": 34157, "chasers": 34158, "remodel": 34159, "willing": 34160, "princesses": 34161, "ample": 34162, "cushions": 34163, "acles": 34164, "lotr": 34165, "dach": 34166, "anthe": 34167, "incorporate": 34168, "newbury": 34169, "kiri": 34170, "friedrich": 34171, "abv": 34172, "ballers": 34173, "albert": 34174, "ðŁijŃ": 34175, "leti": 34176, "nanop": 34177, "cide": 34178, "analo": 34179, "nsf": 34180, "))))": 34181, "griffiths": 34182, "valenci": 34183, "roano": 34184, "funrun": 34185, "babysitting": 34186, "caday": 34187, "entre": 34188, "uck": 34189, "slug": 34190, "tical": 34191, "thesims": 34192, "roar": 34193, "carney": 34194, "gam": 34195, "stowe": 34196, "fid": 34197, "bunny": 34198, "shamrock": 34199, "pecu": 34200, "molina": 34201, "gocougs": 34202, "contributes": 34203, "transformation": 34204, "moy": 34205, "vaj": 34206, "severy": 34207, "antioxidants": 34208, "thirteen": 34209, "sightseeing": 34210, "lj": 34211, "reversible": 34212, "oddly": 34213, 
"hookah": 34214, "nouvel": 34215, "halal": 34216, "fei": 34217, "stables": 34218, "mult": 34219, "hopped": 34220, "braids": 34221, "interchange": 34222, "ghanaian": 34223, "wwww": 34224, "ethno": 34225, "conjunction": 34226, "agov": 34227, "yeti": 34228, "earthand": 34229, "tsp": 34230, "conserve": 34231, "heirloom": 34232, "metaphor": 34233, "woof": 34234, "torio": 34235, "selfless": 34236, "nwa": 34237, "emilia": 34238, "ylene": 34239, "yxe": 34240, "giar": 34241, "moderating": 34242, "probz": 34243, "bfi": 34244, "neer": 34245, "dummy": 34246, "hanukkah": 34247, "webber": 34248, "kv": 34249, "eyebrow": 34250, "dagger": 34251, "sump": 34252, "rages": 34253, "orkney": 34254, "tbo": 34255, "halsey": 34256, "assignments": 34257, "tronic": 34258, "scrib": 34259, "coon": 34260, "anwar": 34261, "#âĢİ": 34262, "jalape": 34263, "florida": 34264, "quaid": 34265, "hawkeyes": 34266, "âĻ¡âĻ¡": 34267, "streetcar": 34268, "rog": 34269, "datlantic": 34270, "granola": 34271, "unchanged": 34272, "expectation": 34273, "Ùĩ": 34274, "marlin": 34275, "gummy": 34276, "ðŁĻıðŁı¾": 34277, "awarenessmonth": 34278, "oilpainting": 34279, "muth": 34280, "perch": 34281, "junto": 34282, "villagers": 34283, "morg": 34284, "cheated": 34285, "webcomic": 34286, "thefuture": 34287, "dps": 34288, "lakings": 34289, "mentioning": 34290, "voor": 34291, "identities": 34292, "accord": 34293, "mcgu": 34294, "lpga": 34295, "rumour": 34296, "massively": 34297, "mpls": 34298, "healy": 34299, "date": 34300, "spoli": 34301, "revisited": 34302, "ont": 34303, "aland": 34304, "scrutiny": 34305, "lakeland": 34306, "blending": 34307, "": 34308, "ankara": 34309, "jamiedor": 34310, "metabolic": 34311, "fences": 34312, "anny": 34313, "åħ": 34314, "semicon": 34315, "oott": 34316, "spaceship": 34317, "wacky": 34318, "leta": 34319, "apac": 34320, "shee": 34321, "inherit": 34322, "dores": 34323, "ðŁĩ¨ðŁĩ¦": 34324, "gente": 34325, "twick": 34326, "rims": 34327, "galve": 34328, "deville": 34329, "kingfisher": 34330, "scorpio": 34331, "owl": 34332, "alar": 34333, "varian": 34334, "ðŁĹĵ": 34335, "venetian": 34336, "stardust": 34337, "thenorth": 34338, "qing": 34339, "harrington": 34340, "consulate": 34341, "spectacle": 34342, "hobbs": 34343, "turks": 34344, "greer": 34345, "mating": 34346, "ðŁİĢ": 34347, "ðŁĮĢ": 34348, "directs": 34349, "íĭ": 34350, "pompeo": 34351, "voiced": 34352, "laos": 34353, "tzu": 34354, "prome": 34355, "prism": 34356, "merc": 34357, "fortunately": 34358, "bcfc": 34359, "mcdonnell": 34360, "notsorry": 34361, "smiled": 34362, "tba": 34363, "forwar": 34364, "midterm": 34365, "darby": 34366, "weinstein": 34367, "upgrading": 34368, "wolff": 34369, "bronco": 34370, "cabello": 34371, "ðŁ¥ĩ": 34372, "fiable": 34373, "sharpe": 34374, "battered": 34375, "sato": 34376, "mythical": 34377, "instapic": 34378, "prepped": 34379, "enium": 34380, "espo": 34381, "diaper": 34382, "explanations": 34383, "whopping": 34384, "ragnar": 34385, "peel": 34386, "antibiotic": 34387, "lacks": 34388, "harrison": 34389, "lism": 34390, "aul": 34391, "quail": 34392, "martina": 34393, "sentencing": 34394, "scams": 34395, "didi": 34396, "tronics": 34397, "ãħłãħł": 34398, "goff": 34399, "zain": 34400, "paramore": 34401, "chained": 34402, "clinton": 34403, "liff": 34404, "cottages": 34405, "emon": 34406, "reverend": 34407, "consumer": 34408, "cean": 34409, "tany": 34410, "lumpur": 34411, "ebay": 34412, "stool": 34413, "ðŁĺ»ðŁĺ»": 34414, "tapro": 34415, "hath": 34416, "modernart": 34417, "justine": 34418, "proverb": 34419, "appy": 34420, "trax": 34421, "manifest": 
34422, "ambu": 34423, "naik": 34424, "pepp": 34425, "rsd": 34426, "merchants": 34427, "kitchener": 34428, "shifted": 34429, "lizz": 34430, "âĺħâĺħâĺħâĺħ": 34431, "âĢĶâĢĶâĢĶâĢĶâĢĶâĢĶâĢĶâĢĶ": 34432, "utopia": 34433, "tomo": 34434, "outed": 34435, "comers": 34436, "chiropractic": 34437, "bookclub": 34438, "cindy": 34439, "prohibition": 34440, "seuss": 34441, "민": 34442, "thinkin": 34443, "rrrr": 34444, "gofund": 34445, "tack": 34446, "omb": 34447, "catastrophic": 34448, "lingu": 34449, "guildford": 34450, "botd": 34451, "à¥ĭ": 34452, "planter": 34453, "^^": 34454, "wink": 34455, "kathmandu": 34456, "stoppers": 34457, "smoothies": 34458, "reefs": 34459, "hind": 34460, "bellamy": 34461, "Ħë": 34462, "wastewater": 34463, "voor": 34464, "natl": 34465, "!]": 34466, "reel": 34467, "yap": 34468, "scooby": 34469, "workspace": 34470, "corinthians": 34471, "blun": 34472, "obligation": 34473, "gbbo": 34474, "dyson": 34475, "cravings": 34476, "ellington": 34477, "dapl": 34478, "wrexham": 34479, "earthandclouds": 34480, "ukrunchat": 34481, "positioned": 34482, "kalb": 34483, "foursquare": 34484, "jock": 34485, "impending": 34486, "evening": 34487, "athy": 34488, "proclaimed": 34489, "cites": 34490, "annapolis": 34491, "sani": 34492, "marth": 34493, "irl": 34494, "accommo": 34495, "kaa": 34496, "fina": 34497, "yaa": 34498, "disper": 34499, "ecar": 34500, "bhak": 34501, "willy": 34502, "ðŁĺĢðŁĺĢ": 34503, "mcdermott": 34504, "moj": 34505, "generational": 34506, "usaid": 34507, "training": 34508, "lonely": 34509, "lores": 34510, "impecc": 34511, "âĢIJ": 34512, "beavers": 34513, "maki": 34514, "heb": 34515, "aapl": 34516, "åı": 34517, "wolverhampton": 34518, "leaderboard": 34519, "meu": 34520, "cfa": 34521, "eastern": 34522, "hur": 34523, "civilwar": 34524, "ourage": 34525, "horned": 34526, "lehigh": 34527, "awards": 34528, "evident": 34529, "gigab": 34530, "rous": 34531, "madel": 34532, "robyn": 34533, "urgently": 34534, "kors": 34535, "enas": 34536, "heisman": 34537, "bambam": 34538, "fabian": 34539, "fom": 34540, "evaluating": 34541, "assembly": 34542, "outsourcing": 34543, "huntsville": 34544, "ðŁĶª": 34545, "justified": 34546, "cashier": 34547, "spaper": 34548, "buckeye": 34549, "analytical": 34550, "illuminati": 34551, "autho": 34552, "oj": 34553, "shade": 34554, "geelong": 34555, "whey": 34556, "heaton": 34557, "terribly": 34558, "elek": 34559, "uncharted": 34560, "sdlive": 34561, "motocross": 34562, "hermes": 34563, "darshan": 34564, "darlington": 34565, "cashmere": 34566, "gripping": 34567, "cilantro": 34568, "punish": 34569, "...:": 34570, "ðŁĴĦ": 34571, "instance": 34572, "deri": 34573, "lobal": 34574, "mukher": 34575, "spar": 34576, "thinker": 34577, "fremont": 34578, "compiled": 34579, "colorado": 34580, "vigne": 34581, "smd": 34582, "whead": 34583, "village": 34584, "leek": 34585, "formulae": 34586, "tares": 34587, "persistence": 34588, "??????": 34589, "pedago": 34590, "hez": 34591, "alzheimers": 34592, "vulture": 34593, "offence": 34594, "isgreat": 34595, "suffra": 34596, "kickin": 34597, "hmmmm": 34598, "broadway": 34599, "ï¸ı@": 34600, "arti": 34601, "allison": 34602, "endorses": 34603, "ryu": 34604, "lollipop": 34605, "soybean": 34606, "kendall": 34607, "cera": 34608, "invade": 34609, "(ðŁĵ·:": 34610, "converter": 34611, "carpets": 34612, "hobo": 34613, "frit": 34614, "peac": 34615, "esqu": 34616, "ernan": 34617, "ouf": 34618, "anil": 34619, "differ": 34620, "ching": 34621, "brecht": 34622, "spg": 34623, "davenport": 34624, "strava": 34625, "severn": 34626, "ngos": 34627, "storians": 34628, 
"fete": 34629, "paramedic": 34630, "jhb": 34631, "alamo": 34632, "sneaking": 34633, "goldcoast": 34634, "roofs": 34635, "isil": 34636, "depicted": 34637, "projections": 34638, "numb": 34639, "oss": 34640, "epi": 34641, "glucose": 34642, "zidane": 34643, "infiniti": 34644, "íĺĦ": 34645, "ransom": 34646, "tonics": 34647, "falk": 34648, "gler": 34649, "outw": 34650, "ress": 34651, "weekly": 34652, "theon": 34653, "nole": 34654, "ðŁĩªðŁĩº": 34655, "volley": 34656, "summar": 34657, "negativity": 34658, "samson": 34659, "yew": 34660, "ausvotes": 34661, "jul": 34662, "judy": 34663, "fart": 34664, "prayed": 34665, "palate": 34666, "multicultural": 34667, "doubleheader": 34668, "cyclones": 34669, "pierre": 34670, "ãģ¨": 34671, "âĺłï¸ı": 34672, "rtw": 34673, "converting": 34674, "wirral": 34675, "lari": 34676, "irrelevant": 34677, "austinmahone": 34678, "anche": 34679, "yaan": 34680, "sdf": 34681, "$.": 34682, "exploding": 34683, "ultimate": 34684, "profici": 34685, "gofundme": 34686, "cellence": 34687, "epstein": 34688, "bullied": 34689, "septic": 34690, "த": 34691, "lumber": 34692, "cuff": 34693, "vscocam": 34694, "plor": 34695, "ล": 34696, "seok": 34697, "roto": 34698, "venezuelan": 34699, "sorta": 34700, "spirited": 34701, "danielpadilla": 34702, "teamsisd": 34703, "radioactive": 34704, "icelandic": 34705, "ðŁĴ¤": 34706, "vere": 34707, "accommodate": 34708, "shipp": 34709, "otter": 34710, "olina": 34711, "ego": 34712, "sula": 34713, "sanantonio": 34714, "deas": 34715, "similarities": 34716, "âļ¾": 34717, "yom": 34718, "broward": 34719, "å°": 34720, "cancun": 34721, "verify": 34722, "onte": 34723, "candlelight": 34724, "ìłķ": 34725, "infants": 34726, "azam": 34727, "ðŁĺ°": 34728, "leven": 34729, "unstable": 34730, "bloomington": 34731, "xford": 34732, "contour": 34733, "yp": 34734, "innovator": 34735, "histories": 34736, "poy": 34737, "lololol": 34738, "expires": 34739, "catalo": 34740, "billboards": 34741, "anab": 34742, "elic": 34743, "novascotia": 34744, "faire": 34745, "ìĿ´": 34746, "rockwell": 34747, "grille": 34748, "aztec": 34749, "johor": 34750, "urstruly": 34751, "firen": 34752, "dunlop": 34753, "idle": 34754, "portman": 34755, "joes": 34756, "txhsfb": 34757, "holm": 34758, "chamele": 34759, "underworld": 34760, "loss": 34761, "tiem": 34762, "therapists": 34763, "pasture": 34764, "paste": 34765, "ingnow": 34766, "vulcan": 34767, "ragon": 34768, "larkin": 34769, "oshi": 34770, "hoco": 34771, "childhood": 34772, "umbrel": 34773, "successor": 34774, "kathy": 34775, "izen": 34776, "°ï¸ı": 34777, "shareholders": 34778, "olga": 34779, "aib": 34780, "heap": 34781, "flaming": 34782, "rou": 34783, "airtel": 34784, "ratt": 34785, "zane": 34786, "vow": 34787, "thorough": 34788, "snag": 34789, "parth": 34790, "unconscious": 34791, "vey": 34792, "newrelease": 34793, "ghee": 34794, "croatian": 34795, "facilitating": 34796, "swanson": 34797, "astoria": 34798, "tology": 34799, "mastery": 34800, "ðŁ¤ij": 34801, "bilbao": 34802, "troupe": 34803, "theori": 34804, "cheyenne": 34805, "rott": 34806, "shoreline": 34807, "grasso": 34808, "masterchef": 34809, "+)": 34810, "vix": 34811, "ellenshow": 34812, "asg": 34813, "anak": 34814, "kuya": 34815, "safarilive": 34816, "debuting": 34817, "blum": 34818, "listener": 34819, "vins": 34820, "bookshelf": 34821, "smartcities": 34822, "makeyourownlane": 34823, ";;": 34824, "ðŁIJ¯": 34825, "rizz": 34826, "onward": 34827, "bulldog": 34828, "bearish": 34829, "viruses": 34830, "frigh": 34831, "linden": 34832, "weiser": 34833, "snt": 34834, "gona": 34835, "dresden": 34836, 
"flanders": 34837, "cuk": 34838, "wheeling": 34839, "bau": 34840, "atuesday": 34841, "surfers": 34842, "swift": 34843, "mccall": 34844, "arbitration": 34845, "awd": 34846, "monc": 34847, "bine": 34848, "atx": 34849, "refr": 34850, "miro": 34851, "posey": 34852, "nare": 34853, "ritter": 34854, "âģ¦": 34855, "playbook": 34856, "blowout": 34857, "sportsmanship": 34858, "soooooo": 34859, "malayalam": 34860, "grims": 34861, "burbank": 34862, "infinity": 34863, "sargent": 34864, "oitnb": 34865, "josephine": 34866, "skipping": 34867, "parkin": 34868, "excursion": 34869, "seminars": 34870, "johar": 34871, "partridge": 34872, "postgame": 34873, "llll": 34874, "blanche": 34875, "tempting": 34876, "mna": 34877, "luka": 34878, "isers": 34879, "toffee": 34880, "barron": 34881, "hemmings": 34882, "sae": 34883, "gohawks": 34884, "cupid": 34885, "limbs": 34886, "conse": 34887, "uncommon": 34888, "zada": 34889, "headshot": 34890, "soils": 34891, "pioneer": 34892, "mamma": 34893, "semitic": 34894, "pandey": 34895, "jamiedornan": 34896, "splits": 34897, "vela": 34898, "soni": 34899, "raff": 34900, "tmobile": 34901, "âŀĸ": 34902, "prawns": 34903, "liter": 34904, "enjoyment": 34905, "eggplant": 34906, "tub": 34907, "cultural": 34908, "usic": 34909, "suspicion": 34910, "sycam": 34911, "summed": 34912, "madu": 34913, "hock": 34914, "upwards": 34915, "eyeing": 34916, "rive": 34917, "assassins": 34918, "âĤ¬": 34919, "outfy": 34920, "chives": 34921, "tner": 34922, "lais": 34923, "porridge": 34924, "saddest": 34925, "wcc": 34926, "vicki": 34927, "snails": 34928, "bizitalk": 34929, "millan": 34930, "ðŁĮį": 34931, "samoa": 34932, "jing": 34933, "mikey": 34934, "guj": 34935, "chelms": 34936, "eligibility": 34937, "armada": 34938, "throp": 34939, "surgeries": 34940, "ãĤ¿": 34941, "mohawk": 34942, "exits": 34943, "mem": 34944, "islington": 34945, "cme": 34946, "landfill": 34947, "kaitlyn": 34948, "ðŁİ¼": 34949, "combinations": 34950, "tomorrowland": 34951, "verb": 34952, "cora": 34953, "precisely": 34954, "naom": 34955, "ðŁĨķ": 34956, "shrink": 34957, "softly": 34958, "mercede": 34959, "mandel": 34960, "poodle": 34961, "ballerina": 34962, "soph": 34963, "juxta": 34964, "yat": 34965, "aryan": 34966, "hesitate": 34967, "lowered": 34968, "gular": 34969, "dungeonsand": 34970, "ronan": 34971, "myri": 34972, "spf": 34973, "menopau": 34974, "grasp": 34975, "pathi": 34976, "feasi": 34977, "flaw": 34978, "shistory": 34979, "steward": 34980, "ggle": 34981, "fayre": 34982, "clique": 34983, "credibility": 34984, "yog": 34985, "section": 34986, "musko": 34987, "seville": 34988, "nott": 34989, "calm": 34990, "mateo": 34991, "indicted": 34992, "fiba": 34993, "byl": 34994, "lino": 34995, "ukin": 34996, "!!#": 34997, "enigma": 34998, "sirius": 34999, "busc": 35000, "ðŁįĬ": 35001, "mackerel": 35002, "psalms": 35003, "aat": 35004, "tomorrowspaper": 35005, "ðŁĺĸ": 35006, "pfc": 35007, "...........": 35008, "shrek": 35009, "mullet": 35010, "osh": 35011, "dangerously": 35012, "immensely": 35013, "amur": 35014, "ðŁįĤ": 35015, "propor": 35016, "sya": 35017, "londonmarathon": 35018, "above": 35019, "obligatory": 35020, "prov": 35021, "racha": 35022, "alexis": 35023, "primary": 35024, "shh": 35025, "ethernet": 35026, "dstv": 35027, "cougar": 35028, "unlucky": 35029, "nil": 35030, "steakhouse": 35031, "mela": 35032, "fcbayern": 35033, "causeway": 35034, "catherine": 35035, "fluorescent": 35036, "nxt": 35037, "tokyo": 35038, "ausp": 35039, "relegation": 35040, "quizz": 35041, "shoreditch": 35042, "proudtobe": 35043, "promos": 35044, "interacting": 
35045, "homebrew": 35046, "daesh": 35047, "wpg": 35048, "steadily": 35049, "provinces": 35050, "ballots": 35051, "iah": 35052, "alto": 35053, "<<<": 35054, "youu": 35055, "riley": 35056, "preference": 35057, "traverse": 35058, "incense": 35059, "ammunition": 35060, "hodges": 35061, "#@": 35062, "hailstate": 35063, "tartan": 35064, "witchcraft": 35065, "ventilation": 35066, "libertarian": 35067, "!âĢ¦": 35068, "owes": 35069, "%!": 35070, "ongchang": 35071, "brushing": 35072, "leic": 35073, "fiber": 35074, "underattack": 35075, "download": 35076, "expir": 35077, "hyo": 35078, "pompey": 35079, "mcbride": 35080, "yag": 35081, "stree": 35082, "combat": 35083, "tending": 35084, "aira": 35085, "guggen": 35086, "abra": 35087, "inna": 35088, "flips": 35089, "awal": 35090, "mach": 35091, "dollar": 35092, "inspirations": 35093, "zum": 35094, "odu": 35095, "itty": 35096, "videogame": 35097, "aquaman": 35098, "haru": 35099, "belfast": 35100, "jeb": 35101, "butch": 35102, "usgs": 35103, "calculus": 35104, "goyal": 35105, "morgen": 35106, "xfinity": 35107, "standup": 35108, "contracep": 35109, "sabre": 35110, "nabe": 35111, "insecure": 35112, "generously": 35113, "epitome": 35114, "lw": 35115, "tca": 35116, "narratives": 35117, "donnell": 35118, "pandas": 35119, "bergh": 35120, "tut": 35121, "keral": 35122, "felicity": 35123, "brampton": 35124, "quintet": 35125, "nomore": 35126, "ðŁĶij": 35127, "loi": 35128, "alhamdulil": 35129, "ðŁĶ¥ðŁĶĹ": 35130, "stoner": 35131, "shawl": 35132, "clinical": 35133, "brendan": 35134, "gone": 35135, "flawed": 35136, "trippy": 35137, "jg": 35138, "allocation": 35139, "poaching": 35140, "vevo": 35141, "mocks": 35142, "leftist": 35143, "bonuses": 35144, "condemned": 35145, "ability": 35146, "stating": 35147, "microbiome": 35148, "biologist": 35149, "foryou": 35150, "wahlberg": 35151, "ssor": 35152, "iftar": 35153, "wul": 35154, "ÑĦоÑĤ": 35155, "pomer": 35156, "meme": 35157, "verte": 35158, "trell": 35159, "trait": 35160, "inlet": 35161, "hormones": 35162, "deliberately": 35163, "villar": 35164, "battleship": 35165, "pbl": 35166, "twenti": 35167, "hokies": 35168, "dalail": 35169, "saya": 35170, "mayfair": 35171, "hans": 35172, "diets": 35173, "⾨⾨": 35174, "odin": 35175, "hotspur": 35176, "papi": 35177, "kana": 35178, "kamp": 35179, "finna": 35180, "flotus": 35181, "tians": 35182, "unicorns": 35183, "tribeca": 35184, "changers": 35185, "foreground": 35186, "outa": 35187, "invaders": 35188, "gettys": 35189, "tomorrowspaperstoday": 35190, "macmillan": 35191, "handwritten": 35192, "wfp": 35193, "ude": 35194, "stateof": 35195, "based": 35196, "âĺģï¸ı": 35197, "casm": 35198, "psyched": 35199, "historians": 35200, "fold": 35201, "dda": 35202, "aggrav": 35203, "pans": 35204, "greenway": 35205, "ausv": 35206, "ðŁĺ¶": 35207, "shraddha": 35208, "index": 35209, "besti": 35210, "zimmer": 35211, "tness": 35212, "eyeshadow": 35213, "otte": 35214, "gots": 35215, "distributing": 35216, "promin": 35217, "yol": 35218, "acea": 35219, "tramrahim": 35220, "hooper": 35221, "supreme": 35222, "jammin": 35223, "intuitive": 35224, "qualifications": 35225, "slim": 35226, "siddi": 35227, "jayne": 35228, "tripping": 35229, "gtx": 35230, "puns": 35231, "emanuel": 35232, "omg": 35233, "midsummer": 35234, "into": 35235, "succulent": 35236, "rien": 35237, "newmexico": 35238, "oor": 35239, "hooking": 35240, "inf": 35241, "ðŁ¤Ŀ": 35242, "flirting": 35243, "nahi": 35244, "gfriend": 35245, "tps": 35246, "helix": 35247, "zs": 35248, "onie": 35249, "ctf": 35250, "kris": 35251, "irresistible": 35252, "flap": 35253, 
"ðŁijıðŁı»ðŁijıðŁı»": 35254, "uswnt": 35255, "rud": 35256, "ramps": 35257, "pinoy": 35258, "otw": 35259, "lolz": 35260, "lowering": 35261, "favorite": 35262, "tmc": 35263, "phrases": 35264, "hermi": 35265, "averaging": 35266, "embr": 35267, "beno": 35268, "estuary": 35269, "sleeve": 35270, "ribbons": 35271, "tash": 35272, "ู": 35273, "xf": 35274, "awgs": 35275, "sunited": 35276, "breweries": 35277, "anirud": 35278, "punches": 35279, "oldie": 35280, "ipads": 35281, "wifey": 35282, "landlords": 35283, "dji": 35284, "gunner": 35285, "íķ´": 35286, "texan": 35287, "exop": 35288, "cassandra": 35289, "soff": 35290, "ðŁļ«": 35291, "ighton": 35292, "bakers": 35293, "awarenessweek": 35294, "vall": 35295, "earp": 35296, "btsbbmas": 35297, "apologizes": 35298, "âļĵï¸ı": 35299, "wasps": 35300, "statesman": 35301, "snatch": 35302, "watchdog": 35303, "rafi": 35304, "afterparty": 35305, "spike": 35306, "jer": 35307, "periph": 35308, "rnc": 35309, "mull": 35310, "leen": 35311, "shies": 35312, "lieu": 35313, "urstrulymahesh": 35314, "merton": 35315, "desai": 35316, "shif": 35317, "ðŁĮ±": 35318, "pedic": 35319, "gosling": 35320, "arranging": 35321, "wwg": 35322, "geny": 35323, "youuu": 35324, "netflix": 35325, "ettes": 35326, "kwi": 35327, "bernardino": 35328, "amiga": 35329, "ب": 35330, "kashmiri": 35331, "tings": 35332, "emeritus": 35333, "decat": 35334, "abdomin": 35335, "dci": 35336, "phases": 35337, "djan": 35338, "beam": 35339, "opry": 35340, "ished": 35341, "theellenshow": 35342, "thest": 35343, "habitats": 35344, "toons": 35345, "mclaughlin": 35346, "ripper": 35347, "microbiology": 35348, "talaga": 35349, "clueless": 35350, "ssu": 35351, "croche": 35352, "bromance": 35353, "longevity": 35354, "zagreb": 35355, "prevented": 35356, "trave": 35357, "spoilt": 35358, "darryl": 35359, "migraine": 35360, "alcat": 35361, "dddd": 35362, "viv": 35363, "serpent": 35364, "mattel": 35365, "jama": 35366, "conquest": 35367, "îĦ": 35368, "samsung": 35369, "presbyterian": 35370, "ketch": 35371, "firefox": 35372, "motif": 35373, "lec": 35374, "chopping": 35375, "cherno": 35376, "jann": 35377, "ðŁIJ°": 35378, "prolon": 35379, "wakeup": 35380, "convergence": 35381, "merseyside": 35382, "heartbroken": 35383, "looming": 35384, "hallucin": 35385, "maize": 35386, "communism": 35387, "moh": 35388, "twitterstorians": 35389, "sergey": 35390, "reseller": 35391, "favorable": 35392, "edgy": 35393, "reiter": 35394, "malaga": 35395, "liveme": 35396, "kahn": 35397, "pulsion": 35398, "bigg": 35399, "kimkardashian": 35400, "atio": 35401, "tyranny": 35402, "ruption": 35403, "qant": 35404, "proven": 35405, "byz": 35406, "pushaw": 35407, "kristin": 35408, "eer": 35409, "tardis": 35410, "riz": 35411, "awaken": 35412, "miko": 35413, "undocumented": 35414, "pathfinder": 35415, "indirect": 35416, "resembles": 35417, "hler": 35418, "concealed": 35419, "scandal": 35420, "reim": 35421, "dnb": 35422, "critters": 35423, "attendant": 35424, "apprenticeships": 35425, "aau": 35426, "screamed": 35427, "lsu": 35428, "fah": 35429, "harbour": 35430, "edd": 35431, "batsman": 35432, "liss": 35433, "misha": 35434, "spaniel": 35435, "itf": 35436, "advancement": 35437, "fac": 35438, "closeup": 35439, "cecilia": 35440, "medic": 35441, "narcissi": 35442, "lavish": 35443, "giac": 35444, "mays": 35445, "leit": 35446, "winewednesday": 35447, "pushaward": 35448, "letto": 35449, "currents": 35450, "bugatti": 35451, "outine": 35452, "wj": 35453, "undo": 35454, "lerosis": 35455, "devotional": 35456, "ðŁij«": 35457, "onna": 35458, "faisal": 35459, "sauna": 35460, 
"himachal": 35461, "amii": 35462, "à®®": 35463, "dizzy": 35464, "screenwriting": 35465, "phx": 35466, "spn": 35467, "icki": 35468, "agirl": 35469, "fishes": 35470, "wbz": 35471, "pim": 35472, "boar": 35473, "acid": 35474, "!..": 35475, "rockefeller": 35476, "nga": 35477, "drastically": 35478, "simplify": 35479, "drumming": 35480, "autumnal": 35481, "gurmee": 35482, "lorde": 35483, "joann": 35484, "giveup": 35485, "bour": 35486, "amura": 35487, "derland": 35488, "simpler": 35489, "watson": 35490, "trident": 35491, "concordia": 35492, "bellum": 35493, "brek": 35494, "dumplings": 35495, "vion": 35496, "dungeonsanddragons": 35497, "spri": 35498, "ascension": 35499, "wildatlantic": 35500, "ust": 35501, "robins": 35502, "legion": 35503, "insist": 35504, "jaro": 35505, "guess": 35506, "sob": 35507, "bighit": 35508, "poolside": 35509, "negotiating": 35510, "mcgill": 35511, "bild": 35512, "technicians": 35513, "mitigation": 35514, "ajaydevgn": 35515, "bto": 35516, "anten": 35517, "cosmopolitan": 35518, "ðŁĺĬðŁĺĬðŁĺĬðŁĺĬ": 35519, "patrioti": 35520, "temper": 35521, "promenade": 35522, "navajo": 35523, "namm": 35524, "wrinkles": 35525, "dcfc": 35526, "leach": 35527, "brunette": 35528, "rf": 35529, "coutinho": 35530, "alti": 35531, "traditionally": 35532, "optome": 35533, "naz": 35534, "accordingly": 35535, "recard": 35536, "deets": 35537, "swell": 35538, "posure": 35539, "whitening": 35540, "stranger": 35541, "illion": 35542, "hereford": 35543, "uwu": 35544, "robber": 35545, "cotswolds": 35546, "clen": 35547, "gorge": 35548, "namaste": 35549, "relish": 35550, "griff": 35551, "adrenaline": 35552, "blasio": 35553, "vale": 35554, "ê²": 35555, "tolerate": 35556, "railminindia": 35557, "jensen": 35558, "hoven": 35559, "ellu": 35560, "obsole": 35561, "eisenhower": 35562, "unidentified": 35563, "thanniversary": 35564, "bodyguard": 35565, "د": 35566, "idge": 35567, "schal": 35568, "stockport": 35569, "sni": 35570, "retaining": 35571, "popo": 35572, "pixie": 35573, "olithic": 35574, "kier": 35575, "hajj": 35576, "saz": 35577, "corbin": 35578, "!!!!!!!!!!": 35579, "vit": 35580, "megat": 35581, "deh": 35582, "circuit": 35583, "affleck": 35584, "theoretical": 35585, "hopeless": 35586, "uab": 35587, "slump": 35588, "bice": 35589, "jammed": 35590, "letstalk": 35591, "cani": 35592, "sideways": 35593, "labyrinth": 35594, "refs": 35595, "hahn": 35596, "jared": 35597, "ðŁį¹": 35598, "jambo": 35599, "phyl": 35600, "enhancement": 35601, "ctr": 35602, "fullest": 35603, "seye": 35604, "doba": 35605, "choic": 35606, "yos": 35607, "cbj": 35608, "andré": 35609, "rewatch": 35610, "prima": 35611, "doctrine": 35612, "forgets": 35613, "uhm": 35614, "around": 35615, "ule": 35616, "artlovers": 35617, "shiraz": 35618, "harth": 35619, "extor": 35620, "Å¡": 35621, "unexpectedly": 35622, "elius": 35623, "yx": 35624, "emmy": 35625, "seac": 35626, "ðŁijĩðŁijĩðŁijĩ": 35627, "corrected": 35628, "combu": 35629, "womanc": 35630, "cough": 35631, "whatson": 35632, "publishes": 35633, "diversity": 35634, "backbone": 35635, "lockdown": 35636, "mesmerizing": 35637, "norte": 35638, "mab": 35639, "designer": 35640, "íģ": 35641, "ragh": 35642, "molecules": 35643, "getoutside": 35644, "thebeatles": 35645, "semiconduc": 35646, "nacho": 35647, "lunes": 35648, "hammers": 35649, "sultan": 35650, "oon": 35651, "feren": 35652, "attach": 35653, "arqu": 35654, "uttarakhand": 35655, "sash": 35656, ";-": 35657, "tread": 35658, "iko": 35659, "arthur": 35660, "scandinavian": 35661, "ration": 35662, "gael": 35663, "chargeable": 35664, "fishy": 35665, "vma": 
35666, "handbags": 35667, "chara": 35668, "ayne": 35669, "defam": 35670, "settlers": 35671, "qadri": 35672, "palais": 35673, "inwx": 35674, "apocalyptic": 35675, "pooja": 35676, "aes": 35677, "atories": 35678, "proofing": 35679, "nlp": 35680, "tsla": 35681, "vina": 35682, "lido": 35683, "deephouse": 35684, "informatics": 35685, "vv": 35686, "ppings": 35687, "diss": 35688, "ï": 35689, "uhuru": 35690, "stony": 35691, "betrayed": 35692, "baff": 35693, "myra": 35694, "aspen": 35695, "allowance": 35696, "tamara": 35697, "cif": 35698, "corbett": 35699, "serge": 35700, "digo": 35701, "ambigu": 35702, "painters": 35703, "pcr": 35704, "pca": 35705, "noms": 35706, "loft": 35707, "vee": 35708, "opendata": 35709, "ðŁIJ±": 35710, "alexandre": 35711, "identifies": 35712, "fantasyfootball": 35713, "reproduction": 35714, "bromley": 35715, "wareagle": 35716, "mmer": 35717, "pss": 35718, "cues": 35719, "ayat": 35720, "hutchinson": 35721, "sarac": 35722, "jackman": 35723, "irah": 35724, "apink": 35725, "cols": 35726, "aussies": 35727, "execs": 35728, "dayton": 35729, "ðŁĻĨ": 35730, "imv": 35731, "haram": 35732, "chuckle": 35733, "authenticity": 35734, "ardo": 35735, "incubator": 35736, "ส": 35737, "photoshopped": 35738, "embraced": 35739, "fightfor": 35740, "gorman": 35741, "zzzz": 35742, "scholastic": 35743, "crisps": 35744, "teapo": 35745, "midnight": 35746, "gaine": 35747, "collier": 35748, "sate": 35749, "dette": 35750, "åŃ": 35751, "imagine": 35752, "iff": 35753, "twili": 35754, "ification": 35755, "teatro": 35756, "norma": 35757, "esur": 35758, "emergencies": 35759, "riseup": 35760, "ringer": 35761, "hassle": 35762, "caitlyn": 35763, "tranquil": 35764, "versa": 35765, "seb": 35766, "overlook": 35767, "gini": 35768, "bogo": 35769, "sere": 35770, "mayne": 35771, "henrik": 35772, "contaminated": 35773, "rhapsody": 35774, "proportion": 35775, "wildatlanticway": 35776, "âģ©.": 35777, "organisers": 35778, "trane": 35779, "standard": 35780, "sperm": 35781, "launcher": 35782, "ricci": 35783, "herts": 35784, "paperwork": 35785, "showcased": 35786, "meryl": 35787, "pena": 35788, "pimp": 35789, "disastrous": 35790, "^.^": 35791, "phara": 35792, "xis": 35793, "frontal": 35794, "swirl": 35795, "spills": 35796, "swagger": 35797, "smartwatch": 35798, "sizzling": 35799, "saviour": 35800, "catar": 35801, "bbcr": 35802, "refurbishment": 35803, "dris": 35804, "citroen": 35805, "absorb": 35806, "patriotism": 35807, "illeg": 35808, "chromo": 35809, "freshers": 35810, "rus": 35811, "limiting": 35812, "efish": 35813, "downed": 35814, "mandir": 35815, "hazelnut": 35816, "pall": 35817, "macon": 35818, "disappearing": 35819, "qualifies": 35820, "boon": 35821, "barracks": 35822, "amine": 35823, "gendere": 35824, "ðŁļĺ": 35825, "jes": 35826, "ãĥŃ": 35827, "quito": 35828, "middleweight": 35829, "schau": 35830, "quadru": 35831, "aciones": 35832, "limitless": 35833, "ðŁijĮðŁı½": 35834, "chman": 35835, "arav": 35836, "regulators": 35837, "itup": 35838, "battersea": 35839, "milford": 35840, "gz": 35841, "ticking": 35842, "ghou": 35843, "crushes": 35844, "tutu": 35845, "dreadful": 35846, "famine": 35847, "forchange": 35848, "dalailama": 35849, "ðŁĴį": 35850, "whitaker": 35851, "hashmi": 35852, "hus": 35853, "vod": 35854, "bette": 35855, "aaah": 35856, "isoo": 35857, "ðŁ¥Ī": 35858, "haar": 35859, "laine": 35860, "bv": 35861, "allday": 35862, "sprout": 35863, "indiegames": 35864, "freebie": 35865, "greeks": 35866, "butler": 35867, "illin": 35868, "haal": 35869, "wareness": 35870, "sima": 35871, "publichealth": 35872, "gama": 35873, 
"waa": 35874, "oung": 35875, "goooo": 35876, "okinawa": 35877, "offenders": 35878, "impose": 35879, "hoc": 35880, "youngster": 35881, "storyteller": 35882, "scap": 35883, "fighter": 35884, "+,": 35885, "whites": 35886, "musicmonday": 35887, "reza": 35888, "goducks": 35889, "bria": 35890, "mium": 35891, "casper": 35892, "crumbs": 35893, "aad": 35894, "martialarts": 35895, "chp": 35896, "rigged": 35897, "tng": 35898, "harvested": 35899, "sak": 35900, "dojo": 35901, "millwall": 35902, "bnw": 35903, "ocd": 35904, "historyof": 35905, "tmr": 35906, "sirens": 35907, "fanci": 35908, "caregivers": 35909, "vira": 35910, "soni": 35911, "recurring": 35912, "acknowledged": 35913, "ðŁıŁ": 35914, "ophile": 35915, "bucky": 35916, "stressing": 35917, "rook": 35918, "digger": 35919, "vival": 35920, "sando": 35921, "fleet": 35922, "siers": 35923, "selcaday": 35924, "refreshed": 35925, "antifa": 35926, "aque": 35927, "polo": 35928, "disappearance": 35929, "demb": 35930, "âĮļï¸ı": 35931, "rented": 35932, "berger": 35933, "gmb": 35934, "cula": 35935, "ssal": 35936, "goody": 35937, "uhh": 35938, "marcelo": 35939, "wanna": 35940, "software": 35941, "shopsmall": 35942, "turtle": 35943, "tomas": 35944, "frisco": 35945, "ðŁĺįðŁĴķ": 35946, "jimenez": 35947, "csu": 35948, "dayz": 35949, "ando": 35950, "wynne": 35951, "choreographer": 35952, "cervical": 35953, "trailblazers": 35954, "edg": 35955, "zendaya": 35956, "travelblog": 35957, "els": 35958, "wholesome": 35959, "cog": 35960, "labout": 35961, "arney": 35962, "delle": 35963, "suisse": 35964, "masi": 35965, "inese": 35966, "ombe": 35967, "fiddle": 35968, "reclaim": 35969, "pau": 35970, "watcher": 35971, "slain": 35972, "berty": 35973, "optimum": 35974, "elites": 35975, "minis": 35976, "turkey": 35977, "patrols": 35978, "gerard": 35979, "aureli": 35980, "wildly": 35981, "waltz": 35982, "brgy": 35983, "wob": 35984, "crest": 35985, "+++": 35986, "vez": 35987, "frosted": 35988, "davido": 35989, "thex": 35990, "paramedics": 35991, "pinto": 35992, "hank": 35993, "dupont": 35994, "urg": 35995, "fostering": 35996, "micropoetry": 35997, "spectre": 35998, "---->": 35999, "neuro": 36000, "frida": 36001, "musical": 36002, "galveston": 36003, "effic": 36004, "scape": 36005, "palazzo": 36006, "thall": 36007, "provisional": 36008, "pjs": 36009, "aure": 36010, "ðŁĶľ": 36011, "mamamoo": 36012, "kitties": 36013, "cree": 36014, "wak": 36015, "loool": 36016, "lupus": 36017, "cnblue": 36018, "ú": 36019, "ðŁİ¬": 36020, "raced": 36021, "trose": 36022, "omas": 36023, "stride": 36024, "coors": 36025, "⤵ï¸ı": 36026, "incomparable": 36027, "cyril": 36028, "broader": 36029, "areclipse": 36030, "ðŁįĶ": 36031, "interval": 36032, "tiru": 36033, "coworking": 36034, "waco": 36035, "aham": 36036, "abee": 36037, "flourish": 36038, "thetimes": 36039, "olini": 36040, "kickboxing": 36041, "lucer": 36042, "atla": 36043, "asun": 36044, "casserole": 36045, "miaw": 36046, "lobbying": 36047, "janice": 36048, "cirque": 36049, "reflex": 36050, "leary": 36051, "sanatomy": 36052, "tempest": 36053, "semb": 36054, "murdering": 36055, "usav": 36056, "robo": 36057, "onet": 36058, "pcc": 36059, "natives": 36060, "lifeof": 36061, "saha": 36062, "ruthless": 36063, "relates": 36064, "appetizer": 36065, "pyeongchang": 36066, "nord": 36067, "eru": 36068, "athing": 36069, "ugly": 36070, "plying": 36071, "brance": 36072, "organise": 36073, "kendra": 36074, "dato": 36075, "cheeses": 36076, "parma": 36077, "burnout": 36078, "astra": 36079, "pretoria": 36080, "adjustment": 36081, "uku": 36082, "slo": 36083, "liken": 36084, 
"favors": 36085, "clive": 36086, "beets": 36087, "snowdonia": 36088, "gotv": 36089, "syn": 36090, "openhouse": 36091, "pani": 36092, "portrayed": 36093, "slated": 36094, "mecca": 36095, "renal": 36096, "supportsmallstreamers": 36097, "staffs": 36098, "dao": 36099, "biker": 36100, "viktor": 36101, "titus": 36102, "admired": 36103, "ðŁĵ±": 36104, "hurrican": 36105, "heats": 36106, "glory": 36107, "photogenic": 36108, "meri": 36109, "depor": 36110, "burnham": 36111, "orangu": 36112, "djing": 36113, "impressionism": 36114, "ignition": 36115, "cai": 36116, "wynn": 36117, "depe": 36118, "coveted": 36119, "collagen": 36120, "saus": 36121, "ornam": 36122, "administrators": 36123, "sson": 36124, "nhpolitics": 36125, "hahahahahahahaha": 36126, "aspirations": 36127, "rgb": 36128, "swollen": 36129, "sowe": 36130, "scr": 36131, "divergent": 36132, "houghton": 36133, "hanoi": 36134, "dory": 36135, "niki": 36136, "landry": 36137, "bcci": 36138, "ðŁijĮðŁijĮ": 36139, "ismail": 36140, "tripod": 36141, "herd": 36142, "bhatt": 36143, "dressage": 36144, "tabby": 36145, "inguish": 36146, "huron": 36147, "à³į": 36148, "Ãł": 36149, "todas": 36150, "evangelical": 36151, "chords": 36152, "stjohn": 36153, "sloppy": 36154, "martyr": 36155, "facebook": 36156, "alight": 36157, "sensei": 36158, "kathniel": 36159, "rites": 36160, "zione": 36161, "uo": 36162, "revelations": 36163, "weightlifting": 36164, "pano": 36165, "ncwx": 36166, "acton": 36167, "à®ķ": 36168, "ز": 36169, "soma": 36170, "à¸Ĺ": 36171, "respecting": 36172, "marche": 36173, "foreman": 36174, "betty": 36175, "kik": 36176, "shibu": 36177, "poon": 36178, "argyle": 36179, "kswx": 36180, "etz": 36181, "marbella": 36182, "brackets": 36183, "standby": 36184, "fireside": 36185, "defiance": 36186, "vex": 36187, "britannia": 36188, "inhabit": 36189, "appoint": 36190, "piyush": 36191, "leash": 36192, "sciento": 36193, "flask": 36194, "senna": 36195, ">:": 36196, "atroc": 36197, "sanderson": 36198, "idlib": 36199, "dhanush": 36200, "ðŁĺĻ": 36201, "enthr": 36202, "hitch": 36203, "dedly": 36204, "alley": 36205, "dork": 36206, "mondo": 36207, "cuddly": 36208, "missin": 36209, "yesss": 36210, "nighting": 36211, "jpn": 36212, "wary": 36213, "umpire": 36214, "maz": 36215, "ê³": 36216, "babs": 36217, "ĭãģ": 36218, "stanford": 36219, "possessed": 36220, "exceeded": 36221, "ðŁĶ¶": 36222, "wallart": 36223, "trap": 36224, "jil": 36225, "hibis": 36226, "spying": 36227, "scribe": 36228, "khalil": 36229, "translator": 36230, "lumb": 36231, "dized": 36232, "chc": 36233, "supervision": 36234, "shutter": 36235, "jag": 36236, "_*": 36237, "yesterdays": 36238, "msf": 36239, "hihi": 36240, "gonzaga": 36241, "gillespie": 36242, "vivek": 36243, "ecstatic": 36244, "thismorning": 36245, "chus": 36246, "edes": 36247, "stoned": 36248, "bees": 36249, "ðŁĩ¹ðŁĩ": 36250, "turin": 36251, "hover": 36252, "atrics": 36253, "stern": 36254, "samheughan": 36255, "autism": 36256, "miya": 36257, "eyewitness": 36258, "writings": 36259, "traveltips": 36260, "chutney": 36261, "pxrtg": 36262, "kenyans": 36263, "mystic": 36264, "krit": 36265, "/$": 36266, "redhead": 36267, "worldly": 36268, "amus": 36269, "opla": 36270, "leve": 36271, "gabbana": 36272, "seen": 36273, "oclock": 36274, "ganga": 36275, "keenan": 36276, "scent": 36277, "oldies": 36278, "gogreen": 36279, "cornerstone": 36280, "comply": 36281, "concours": 36282, "ðŁİ¶ðŁİ¶": 36283, "haan": 36284, "confis": 36285, "awson": 36286, "cleop": 36287, "îĢ": 36288, "suzu": 36289, "sauté": 36290, "algar": 36291, "subscriber": 36292, "esteemed": 36293, 
"ãĤ¤ãĥ": 36294, "worthwhile": 36295, "melrose": 36296, "flock": 36297, "brightly": 36298, "violinist": 36299, "pere": 36300, "slipping": 36301, "andco": 36302, "sigh": 36303, "havan": 36304, "culo": 36305, "msa": 36306, "fibrosis": 36307, "matilda": 36308, "rafting": 36309, "award": 36310, "ëª": 36311, "mmmm": 36312, "geaux": 36313, "steiner": 36314, "sinn": 36315, "helpers": 36316, "beetles": 36317, "aimee": 36318, "taiwan": 36319, "pistachio": 36320, "macbeth": 36321, "mzan": 36322, "descendants": 36323, "onsale": 36324, "inr": 36325, "ilm": 36326, "grouse": 36327, "saig": 36328, "mow": 36329, "bigre": 36330, "adjustments": 36331, "tula": 36332, "mathew": 36333, "translates": 36334, "muh": 36335, "bollah": 36336, "ðŁĴĽðŁĴĻ": 36337, "amores": 36338, "abouts": 36339, "bombshell": 36340, "blaster": 36341, "xavi": 36342, "sns": 36343, "kroger": 36344, "gather": 36345, "eradic": 36346, "daft": 36347, "chemo": 36348, "benches": 36349, "ðŁĩ©ðŁĩ": 36350, "utv": 36351, "oura": 36352, "nko": 36353, "gatorade": 36354, "biafra": 36355, "okstate": 36356, "imdanielpadilla": 36357, "domains": 36358, "openingday": 36359, "kiddo": 36360, "doi": 36361, "rice": 36362, "daycare": 36363, "macmillan": 36364, "bathurst": 36365, "cheerleading": 36366, "ðŁ¦ģ": 36367, "cashback": 36368, "kwon": 36369, "hobbies": 36370, "exempl": 36371, "riesling": 36372, "âļª": 36373, "agles": 36374, "nys": 36375, "everything": 36376, "navis": 36377, "addi": 36378, "magnesium": 36379, "facelift": 36380, "arkham": 36381, "grandes": 36382, "extremist": 36383, "donat": 36384, "vitality": 36385, "pumpkin": 36386, "betta": 36387, "sltd": 36388, "artisan": 36389, "liby": 36390, "peaked": 36391, "ahhhhh": 36392, "maryam": 36393, "assim": 36394, "unsc": 36395, "mente": 36396, "alaya": 36397, "lowers": 36398, "aras": 36399, "griev": 36400, "leip": 36401, "grati": 36402, "crises": 36403, "sprints": 36404, "execute": 36405, "wto": 36406, "msd": 36407, "magical": 36408, "reviewer": 36409, "sparkles": 36410, "jukebox": 36411, "ðŁĺĤâĿ¤ï¸ı": 36412, "payback": 36413, "licenses": 36414, "dunkin": 36415, "belt": 36416, "lakewood": 36417, "hateful": 36418, "budgets": 36419, "revamped": 36420, "pherson": 36421, "kyiv": 36422, "wentworth": 36423, "rosen": 36424, "cruise": 36425, "giggle": 36426, "defstar": 36427, "assassinscre": 36428, "ymouth": 36429, "winkle": 36430, "wfc": 36431, "bandwagon": 36432, "bkk": 36433, "wiring": 36434, "kearney": 36435, "southside": 36436, "petit": 36437, "!ðŁĺį": 36438, "nordic": 36439, "mirza": 36440, "mugabe": 36441, "vl": 36442, "scones": 36443, "ktv": 36444, "sandal": 36445, "duc": 36446, "malls": 36447, "ðŁĴŀðŁĴŀ": 36448, "itc": 36449, "alay": 36450, "impair": 36451, "unrest": 36452, "floss": 36453, "cé": 36454, "abou": 36455, "varying": 36456, "museo": 36457, "server": 36458, "diya": 36459, "hibiscus": 36460, "eroy": 36461, "merritt": 36462, "findom": 36463, "fpp": 36464, "unusually": 36465, "gott": 36466, "contingent": 36467, "aliaa": 36468, "ballon": 36469, "jol": 36470, "hiked": 36471, "zyme": 36472, "ayr": 36473, "agn": 36474, "gaz": 36475, "periodic": 36476, "sparty": 36477, "practising": 36478, "linton": 36479, "talis": 36480, "cypri": 36481, "womaninbiz": 36482, "radiodisney": 36483, "ðŁĮ¼": 36484, "jumpers": 36485, "endocr": 36486, "ðŁļ¨ðŁļ¨": 36487, "andon": 36488, "sharapo": 36489, "mier": 36490, "masonic": 36491, "factories": 36492, "vien": 36493, "bbers": 36494, "ìĽIJ": 36495, "hold": 36496, "kebab": 36497, "beak": 36498, "approached": 36499, "acmilan": 36500, "munro": 36501, "kosher": 36502, 
"excellency": 36503, "negotiation": 36504, "waltdisneyworld": 36505, "crouch": 36506, "teasing": 36507, "suppression": 36508, "enya": 36509, "bce": 36510, "transformationtuesday": 36511, "callie": 36512, "viswas": 36513, "pgat": 36514, "icted": 36515, "endings": 36516, "escu": 36517, "recruited": 36518, "itfc": 36519, "collaborations": 36520, "gino": 36521, "snuck": 36522, "auschwitz": 36523, "ifc": 36524, "xii": 36525, "kesha": 36526, "gervais": 36527, "cloak": 36528, "xl": 36529, "saad": 36530, "probation": 36531, "precau": 36532, "macin": 36533, "anastasi": 36534, "lek": 36535, "eazy": 36536, "daysofcode": 36537, "mariahcarey": 36538, "yog": 36539, "stitched": 36540, "boyfriends": 36541, "shar": 36542, "phile": 36543, "agu": 36544, "twinkle": 36545, "phishing": 36546, "weekender": 36547, "icton": 36548, "gurmeetramrahim": 36549, "alton": 36550, "leness": 36551, "allan": 36552, "penultimate": 36553, "krystal": 36554, "gou": 36555, "lande": 36556, "dismant": 36557, "abusing": 36558, "norse": 36559, "paterson": 36560, "edmun": 36561, "apan": 36562, "xiumin": 36563, "skel": 36564, "catwalk": 36565, "react": 36566, "walled": 36567, "tangle": 36568, "bryn": 36569, "veto": 36570, "supermoon": 36571, "casablanc": 36572, "appreciates": 36573, "skid": 36574, "both": 36575, "catalina": 36576, "eleague": 36577, "cybermonday": 36578, "cautious": 36579, "ðŁ¤ĵ": 36580, "novo": 36581, "hampton": 36582, "haye": 36583, "josef": 36584, "varan": 36585, "lobos": 36586, "roanoke": 36587, "orphans": 36588, "ttin": 36589, "squads": 36590, "ishqbaaaz": 36591, "blackpanther": 36592, "etu": 36593, "ksh": 36594, "crumble": 36595, "cessna": 36596, "relieved": 36597, "scully": 36598, "pollinators": 36599, "explorecanada": 36600, "kies": 36601, "kamloops": 36602, "kiran": 36603, "primal": 36604, "settlements": 36605, "hotspot": 36606, "brainstorming": 36607, "cedric": 36608, "biennial": 36609, "shant": 36610, "âĻ¡âĻ¡âĻ¡": 36611, "doon": 36612, "hearn": 36613, "walkway": 36614, "fem": 36615, "veal": 36616, "deportation": 36617, "toxins": 36618, "eliminating": 36619, "descending": 36620, "bythe": 36621, "blasphe": 36622, "hasta": 36623, "complement": 36624, "ascent": 36625, "riga": 36626, "provost": 36627, "âĸª": 36628, "weeping": 36629, "antisemitism": 36630, "employee": 36631, "unearthed": 36632, "pino": 36633, "natalie": 36634, "blad": 36635, "angola": 36636, "lockheed": 36637, "inian": 36638, "agr": 36639, "nister": 36640, "impala": 36641, "mke": 36642, "fanatic": 36643, "âĺħâĺħ": 36644, "ðŁij¸": 36645, "luch": 36646, "simplified": 36647, "gallery": 36648, "economic": 36649, "cyborg": 36650, "coni": 36651, "selma": 36652, "inception": 36653, "koala": 36654, "dvds": 36655, "crested": 36656, "mmor": 36657, "visible": 36658, "nsd": 36659, "ðŁĻĮðŁı½": 36660, "wunder": 36661, "refrigerator": 36662, "reopening": 36663, "eera": 36664, "carousel": 36665, "asp": 36666, "ballistic": 36667, "victory": 36668, "motive": 36669, "trey": 36670, "sharapova": 36671, "sii": 36672, "monter": 36673, "intend": 36674, "westchester": 36675, "spe": 36676, "cymb": 36677, "vidal": 36678, "llama": 36679, "univ": 36680, "finer": 36681, "craftsmanship": 36682, "jazzfest": 36683, "bch": 36684, "aggio": 36685, "ncc": 36686, "lambda": 36687, "tranquility": 36688, "cisco": 36689, "baden": 36690, "sobbing": 36691, "ofi": 36692, "gota": 36693, "rumored": 36694, "warmed": 36695, "orean": 36696, "acton": 36697, "marci": 36698, "ghani": 36699, "âľĵ": 36700, "assorted": 36701, "pembroke": 36702, "penelope": 36703, "daf": 36704, "atty": 36705, "aimo": 
36706, "pretzel": 36707, "carnival": 36708, "thanos": 36709, "kochi": 36710, "mersal": 36711, "hamradio": 36712, "artwit": 36713, "casc": 36714, "guerrilla": 36715, "kushner": 36716, "kapp": 36717, "alise": 36718, "toddlers": 36719, "stewardship": 36720, "otti": 36721, "terri": 36722, "tempe": 36723, "restless": 36724, "vito": 36725, "zayed": 36726, "rspb": 36727, "pion": 36728, "hippo": 36729, "hawthorne": 36730, "inas": 36731, "amily": 36732, "nutcracker": 36733, "lop": 36734, "dali": 36735, "tropic": 36736, "ðŁ¤ł": 36737, "ulo": 36738, "jaredle": 36739, "pyrene": 36740, "paleo": 36741, "usair": 36742, "mould": 36743, "itated": 36744, "genetically": 36745, "biomass": 36746, "ðŁĩ³ðŁĩ±": 36747, "dodd": 36748, "practiced": 36749, "monarchs": 36750, "unmanned": 36751, "mbuhari": 36752, "amal": 36753, "photogra": 36754, "kool": 36755, "brendon": 36756, "juices": 36757, "cure": 36758, "worldbank": 36759, "pointers": 36760, "ðŁĴĿ": 36761, "turf": 36762, "leds": 36763, "borussia": 36764, "baptism": 36765, "warwickshire": 36766, "mounts": 36767, "gayo": 36768, "begg": 36769, "copied": 36770, "asians": 36771, "kg": 36772, "modernist": 36773, "gid": 36774, "frontman": 36775, "concentrated": 36776, "yt": 36777, "scavenger": 36778, "ironically": 36779, "adic": 36780, "psn": 36781, "ðŁ¥ī": 36782, "culturally": 36783, "yuv": 36784, "macarthur": 36785, "fertilizer": 36786, "bewithyou": 36787, "rigor": 36788, "minors": 36789, "zoning": 36790, "âĸł": 36791, "rir": 36792, "adolescent": 36793, "vinny": 36794, "reng": 36795, "sandstone": 36796, "guet": 36797, "westh": 36798, "pledged": 36799, "laced": 36800, "spide": 36801, "vai": 36802, "tycoon": 36803, "seizure": 36804, "dup": 36805, "appalachian": 36806, "rok": 36807, "catholics": 36808, "seychel": 36809, "possess": 36810, "lager": 36811, "jodi": 36812, "champ": 36813, "stras": 36814, "dina": 36815, "centuri": 36816, "calder": 36817, "bluray": 36818, "ðŁĩ¨ðŁĩ³": 36819, "modo": 36820, "annette": 36821, "youtubers": 36822, "chaps": 36823, "angling": 36824, "labeling": 36825, "aqui": 36826, "pkwy": 36827, "lyle": 36828, "bisexual": 36829, "litur": 36830, "dugout": 36831, "libby": 36832, "greysanatomy": 36833, "substances": 36834, "augustus": 36835, "rallying": 36836, "fidel": 36837, "ingue": 36838, "人": 36839, "hallmarkchannel": 36840, "toothbrush": 36841, "má": 36842, "adirond": 36843, "aggi": 36844, "ðŁĵį:": 36845, "crusade": 36846, "taxation": 36847, "kz": 36848, "iver": 36849, "doubling": 36850, "roomie": 36851, "wab": 36852, "enrolled": 36853, "azon": 36854, "aju": 36855, "grandchildren": 36856, "asdf": 36857, "ðŁ¥º": 36858, "matic": 36859, "oughton": 36860, "utilize": 36861, "ðŁĴ£": 36862, "ponder": 36863, "raisin": 36864, "dysfunction": 36865, "cobain": 36866, "butternut": 36867, "eman": 36868, "sured": 36869, "drian": 36870, "andfriends": 36871, "withthe": 36872, "onomy": 36873, "heineken": 36874, "bridal": 36875, "leadership": 36876, "pyramids": 36877, "deutschland": 36878, "jocel": 36879, "bowel": 36880, "yqr": 36881, "horsepower": 36882, "beacon": 36883, "ingeni": 36884, "gradient": 36885, "fermented": 36886, "moom": 36887, "thingy": 36888, "potassi": 36889, "wristband": 36890, "bord": 36891, "bodied": 36892, "ðŁĺŃðŁĺį": 36893, "mapp": 36894, "kau": 36895, "cyberpunk": 36896, "phish": 36897, "looking": 36898, "coates": 36899, "apur": 36900, "amie": 36901, "uklabour": 36902, "atin": 36903, "gla": 36904, "adoptable": 36905, "shelby": 36906, "villi": 36907, "riya": 36908, "mingly": 36909, "climber": 36910, "bumblebee": 36911, "ðŁĺ¸": 36912, "csd": 
36913, "âĿ¥": 36914, "hospitalized": 36915, "cki": 36916, "hater": 36917, "chr": 36918, "retina": 36919, "ita": 36920, "fanbase": 36921, "beatrice": 36922, "gwyne": 36923, "goss": 36924, "fos": 36925, "favorited": 36926, "swachhbharat": 36927, "malade": 36928, "monmouth": 36929, "\"[": 36930, "sivan": 36931, "shhh": 36932, "commanding": 36933, "sainsburys": 36934, "weed": 36935, "gman": 36936, "ssw": 36937, "reptile": 36938, "ivy": 36939, "tropics": 36940, "rollers": 36941, "overcast": 36942, "exposition": 36943, "masquerade": 36944, "mancrush": 36945, "waist": 36946, "sprinter": 36947, "sleet": 36948, "levin": 36949, "jpg": 36950, "_(": 36951, "opel": 36952, "exploit": 36953, "apa": 36954, "powe": 36955, "wrecking": 36956, "jongin": 36957, "orb": 36958, "erick": 36959, "bosco": 36960, "praising": 36961, "bertr": 36962, "towing": 36963, "insecurity": 36964, "kut": 36965, "restocked": 36966, "rrp": 36967, "prescribed": 36968, "trafalgar": 36969, "pert": 36970, "gases": 36971, "apprais": 36972, "ghar": 36973, "musicals": 36974, "âĸ¬âĸ¬": 36975, "mcfad": 36976, "agony": 36977, "condition": 36978, "equip": 36979, "shik": 36980, "atravel": 36981, "ðŁĩ¿ðŁĩ¦": 36982, "keh": 36983, "abduction": 36984, "peoria": 36985, "wilkins": 36986, "gms": 36987, "asd": 36988, "evi": 36989, "ðŁĴĹðŁĴĹðŁĴĹ": 36990, "uz": 36991, "moc": 36992, "hallelujah": 36993, "guadalu": 36994, "louvre": 36995, "drawing": 36996, "gove": 36997, "phant": 36998, "frie": 36999, "webdev": 37000, "programmer": 37001, "zable": 37002, "gamescom": 37003, "clarify": 37004, "lith": 37005, "kinky": 37006, "âĿ£": 37007, "labourdoorstep": 37008, "sonata": 37009, "juris": 37010, "maiden": 37011, "viadu": 37012, "bucharest": 37013, "conditioned": 37014, "capitalist": 37015, "ude": 37016, "psb": 37017, "spca": 37018, "lulla": 37019, "foothills": 37020, "kayo": 37021, "bond": 37022, "womb": 37023, "rounder": 37024, "cesar": 37025, "bursts": 37026, "apra": 37027, "swoon": 37028, "sabrin": 37029, "fragrant": 37030, "clearer": 37031, "kubrick": 37032, "climax": 37033, "journo": 37034, "agle": 37035, "ðŁı½âĢįâĻĢï¸ı": 37036, "pooch": 37037, "hale": 37038, "solit": 37039, "salmon": 37040, "organisms": 37041, "bronson": 37042, "arten": 37043, "hodgson": 37044, "alove": 37045, "venture": 37046, "bbi": 37047, "aea": 37048, "ðŁIJ¢": 37049, "ldn": 37050, "dnr": 37051, "ozone": 37052, "ellas": 37053, "manny": 37054, "azzur": 37055, "unbeat": 37056, "truffles": 37057, "thong": 37058, "mañ": 37059, "lasers": 37060, "leye": 37061, "gettysburg": 37062, "backpacks": 37063, "oris": 37064, "maison": 37065, "crawling": 37066, "labra": 37067, "cling": 37068, "dragging": 37069, "steal": 37070, "doubt": 37071, "devan": 37072, "ckers": 37073, "agentsof": 37074, "photobomb": 37075, "elonmusk": 37076, "aboy": 37077, "distances": 37078, "storyline": 37079, "spi": 37080, "northan": 37081, "europeans": 37082, "whale": 37083, "serpent": 37084, "ðŁļ²": 37085, "fior": 37086, "trit": 37087, "oxo": 37088, "awarding": 37089, "classmate": 37090, "sufc": 37091, "smartest": 37092, "riches": 37093, "prk": 37094, "bigfoot": 37095, "armb": 37096, "bipolar": 37097, "dwelling": 37098, "omars": 37099, "kwan": 37100, "grime": 37101, "meng": 37102, "frederick": 37103, "navarro": 37104, "sorrynotsorry": 37105, "jaredleto": 37106, "pave": 37107, "slack": 37108, "barnsley": 37109, "attar": 37110, "eviction": 37111, "accumulation": 37112, "oir": 37113, "catchy": 37114, "welter": 37115, "vikas": 37116, "hassee": 37117, "nikita": 37118, "moyes": 37119, "mathews": 37120, "shiv": 37121, "gatwick": 
37122, "profiling": 37123, "companions": 37124, "marrake": 37125, "antics": 37126, "ðŁĻĮðŁĻĮðŁĻĮ": 37127, "sese": 37128, "boi": 37129, "bartlett": 37130, "poisonous": 37131, "abuses": 37132, "ymm": 37133, "kampala": 37134, "guggenheim": 37135, "imvkohli": 37136, "dolom": 37137, "bree": 37138, "throttle": 37139, "gareth": 37140, "fitzpatrick": 37141, "unya": 37142, "parad": 37143, "margot": 37144, "jnr": 37145, "wea": 37146, "potassium": 37147, "pnc": 37148, "disguised": 37149, "crash": 37150, "renergy": 37151, "illic": 37152, "coupled": 37153, "niels": 37154, "ciones": 37155, "æĹ¥": 37156, "iment": 37157, "despicable": 37158, "dye": 37159, "whatcha": 37160, "connections": 37161, "paralympics": 37162, "gauntlet": 37163, "waitrose": 37164, "suicidal": 37165, "starship": 37166, "vapor": 37167, "stou": 37168, "lawmaker": 37169, "cooled": 37170, "simo": 37171, "theno": 37172, "offroad": 37173, "jaden": 37174, "basque": 37175, "vicky": 37176, "lukaku": 37177, "centro": 37178, "trish": 37179, "strategist": 37180, "medications": 37181, "horst": 37182, "bfc": 37183, "grail": 37184, "sharply": 37185, "aditya": 37186, "tomb": 37187, "kaufman": 37188, "tripad": 37189, "samba": 37190, "pastoral": 37191, "britney": 37192, "sagan": 37193, "hillside": 37194, "masons": 37195, "sara": 37196, "zone": 37197, "xu": 37198, "totes": 37199, "robbie": 37200, "appen": 37201, "montag": 37202, "dero": 37203, "shortfilm": 37204, "charismatic": 37205, "tators": 37206, "kiba": 37207, "andri": 37208, "alarming": 37209, "splitting": 37210, "icar": 37211, "thug": 37212, "scariest": 37213, "sylvester": 37214, "anan": 37215, "utrecht": 37216, "adifference": 37217, "meade": 37218, "buster": 37219, "airstrikes": 37220, "cuffs": 37221, "accountants": 37222, "ðŁĺ¡ðŁĺ¡": 37223, "newt": 37224, "bott": 37225, "issuing": 37226, "clancy": 37227, "wwenetwork": 37228, "kyuhyun": 37229, "resemble": 37230, "pajamas": 37231, "sink": 37232, "kinney": 37233, "sulph": 37234, "ork": 37235, "lies": 37236, "lagh": 37237, "orton": 37238, "rahul": 37239, "dsc": 37240, "wewill": 37241, "ream": 37242, "colloqui": 37243, "sharia": 37244, "hectic": 37245, "sarcasm": 37246, "lander": 37247, "tmz": 37248, "endorf": 37249, "roz": 37250, "hammered": 37251, "fris": 37252, "wadi": 37253, "popefrancis": 37254, "heit": 37255, "flashlight": 37256, "unborn": 37257, "opes": 37258, "holiness": 37259, "ðŁIJ¦": 37260, "nacht": 37261, "imsa": 37262, "gracing": 37263, "bjp": 37264, "verts": 37265, "csc": 37266, "homeowner": 37267, "aque": 37268, "bigotry": 37269, "annie": 37270, "bagh": 37271, "âĿ¤ï¸ıðŁĺį": 37272, "cari": 37273, "thomp": 37274, "disposable": 37275, "cardiology": 37276, "patented": 37277, "hhhhhh": 37278, "ldr": 37279, "stephenson": 37280, "crores": 37281, "fanning": 37282, "climat": 37283, "ðŁijįðŁijįðŁijį": 37284, "ðŁijįðŁı¼": 37285, "aeron": 37286, "piccadilly": 37287, "bankrupt": 37288, "silvia": 37289, "employ": 37290, "donny": 37291, "commenting": 37292, "screenwriter": 37293, "iota": 37294, "cean": 37295, "ancers": 37296, "tuan": 37297, "streetwear": 37298, "य": 37299, "skine": 37300, "espa": 37301, "asif": 37302, "osce": 37303, "sheppard": 37304, "morecam": 37305, "bottle": 37306, "ders": 37307, "oracle": 37308, "googleplay": 37309, "averaged": 37310, "edmonton": 37311, "stephan": 37312, "sisterhood": 37313, "crusted": 37314, "staggering": 37315, "methodology": 37316, "congresswoman": 37317, "cabo": 37318, "triggers": 37319, "milky": 37320, "glide": 37321, "toothpaste": 37322, "roommates": 37323, "nuff": 37324, "guam": 37325, "sprinkles": 
37326, "alternative": 37327, "watfordfc": 37328, "uoft": 37329, "haley": 37330, "contacted": 37331, "bundy": 37332, "prostitu": 37333, "ghar": 37334, "preston": 37335, "onsite": 37336, "hilar": 37337, "gts": 37338, "catt": 37339, "hampstead": 37340, "??!": 37341, "ðŁĩ§ðŁĩ": 37342, "bbcqt": 37343, "alessandro": 37344, "resist": 37345, "maidan": 37346, "tko": 37347, "shading": 37348, "pinup": 37349, "gallo": 37350, "sinu": 37351, "atec": 37352, "funk": 37353, "aclu": 37354, "strides": 37355, "rhyme": 37356, "wetland": 37357, "bbcspringwatch": 37358, "tins": 37359, "wildcard": 37360, "stour": 37361, "flamenco": 37362, "paula": 37363, "ontology": 37364, "gangsta": 37365, "amade": 37366, "ãĤ«": 37367, "tbs": 37368, "skeletal": 37369, "runner": 37370, "jardin": 37371, "harrier": 37372, "hunted": 37373, "zhen": 37374, "believeinfilm": 37375, "demean": 37376, "auditi": 37377, "restart": 37378, "chondri": 37379, "âĿ¤ï¸ıðŁĴĻ": 37380, "mclaren": 37381, "gab": 37382, "shum": 37383, "ausa": 37384, "lewisham": 37385, "ypg": 37386, "kjv": 37387, "furnished": 37388, "doro": 37389, "bonded": 37390, "morty": 37391, "latitude": 37392, "_)": 37393, "lova": 37394, "waterways": 37395, "vinai": 37396, "shorth": 37397, "drunk": 37398, "cay": 37399, "ayana": 37400, "kaplan": 37401, "cappuccino": 37402, "spro": 37403, "lifeboat": 37404, "hasbro": 37405, "spolice": 37406, "toron": 37407, "doing": 37408, "damn": 37409, "shree": 37410, "fountains": 37411, "entation": 37412, "maru": 37413, "boarder": 37414, "topless": 37415, "jada": 37416, "channing": 37417, "ulls": 37418, "enclosure": 37419, "gibson": 37420, "fractured": 37421, "britton": 37422, "ö": 37423, "tous": 37424, "porth": 37425, "draf": 37426, "trailing": 37427, "margate": 37428, "elife": 37429, "downward": 37430, "linn": 37431, "glades": 37432, "girlpower": 37433, "akrish": 37434, "uki": 37435, "ronda": 37436, "tsc": 37437, "appreciationday": 37438, "vising": 37439, "loom": 37440, "ðŁį³": 37441, "mexican": 37442, "argos": 37443, "yya": 37444, "jadine": 37445, "southport": 37446, "dend": 37447, "sista": 37448, "redeem": 37449, "meng": 37450, "braxton": 37451, "antioxidant": 37452, "skey": 37453, "mpg": 37454, "finding": 37455, "vibration": 37456, "ceu": 37457, "khart": 37458, "dimini": 37459, "cline": 37460, "shelly": 37461, "hines": 37462, "īï¸ı": 37463, "topical": 37464, "nover": 37465, "maxx": 37466, "primitive": 37467, "illustrate": 37468, "bounds": 37469, "trenton": 37470, "jointly": 37471, "breeders": 37472, "uchi": 37473, "wakeupamerica": 37474, "bada": 37475, "ðŁĹ£ï¸ı": 37476, "guacam": 37477, "spheres": 37478, "peregr": 37479, "youthful": 37480, "lolo": 37481, "birmin": 37482, "tly": 37483, "jeremycorbyn": 37484, "defects": 37485, "cosm": 37486, "arent": 37487, "vaa": 37488, "bagels": 37489, "mediac": 37490, "coriander": 37491, "icago": 37492, "ghaz": 37493, "abbas": 37494, "remodel": 37495, "structuring": 37496, "pum": 37497, "outlaw": 37498, "adani": 37499, "rbc": 37500, "gulls": 37501, "nli": 37502, "confuse": 37503, "ðŁijĩðŁı¼": 37504, "vila": 37505, "mcnamara": 37506, "corrections": 37507, "mughal": 37508, "seri": 37509, "regain": 37510, "ssb": 37511, "leave": 37512, "hahahah": 37513, "grande": 37514, "distressed": 37515, "rechargeable": 37516, "hoa": 37517, "housed": 37518, "stil": 37519, "attributed": 37520, "opathic": 37521, "dips": 37522, "prit": 37523, "headphone": 37524, "conclude": 37525, "pilo": 37526, "het": 37527, "utsa": 37528, "nitin": 37529, "jem": 37530, "snippet": 37531, "tutoring": 37532, "oper": 37533, "sunk": 37534, "ensla": 
37535, "chau": 37536, "acorn": 37537, "quintess": 37538, "rankin": 37539, "affiliated": 37540, "ourlives": 37541, "clint": 37542, "seater": 37543, "isaac": 37544, "bashing": 37545, "smear": 37546, "nurse": 37547, "doodling": 37548, "\";": 37549, "saku": 37550, "atrocities": 37551, "imam": 37552, "gfs": 37553, "violating": 37554, "commend": 37555, "bradshaw": 37556, "erville": 37557, "billed": 37558, "bbe": 37559, "thulhu": 37560, "iphones": 37561, "moose": 37562, "dios": 37563, "rew": 37564, "methane": 37565, "strangely": 37566, "whisky": 37567, "tightly": 37568, "spielberg": 37569, "radius": 37570, "noticing": 37571, "wif": 37572, "ignati": 37573, "ifa": 37574, "apis": 37575, "wali": 37576, "haitian": 37577, "bushes": 37578, "yz": 37579, "vl": 37580, "exited": 37581, "assel": 37582, "truec": 37583, "domen": 37584, "asher": 37585, "inking": 37586, "newyearseve": 37587, "hendricks": 37588, "bati": 37589, "ìĿ´ì": 37590, "richter": 37591, "monsanto": 37592, "conline": 37593, "agreat": 37594, "ðŁ¤¯": 37595, "masterpieces": 37596, "arn": 37597, "roughs": 37598, "cleve": 37599, "sev": 37600, "fashions": 37601, "toya": 37602, "shail": 37603, "copeland": 37604, "aquari": 37605, "decals": 37606, "areyou": 37607, "yaya": 37608, "astr": 37609, "font": 37610, "mlm": 37611, "arca": 37612, "ppor": 37613, "pollock": 37614, "xperia": 37615, "conservation": 37616, "chainsaw": 37617, "aggie": 37618, "?!?!?": 37619, "sile": 37620, "shon": 37621, "ìĹIJ": 37622, "notebooks": 37623, "marquette": 37624, "deus": 37625, "bbled": 37626, "spicer": 37627, "mccabe": 37628, "norwich": 37629, "modification": 37630, "boosted": 37631, "strum": 37632, "salesman": 37633, "bangle": 37634, "nissan": 37635, "hezbollah": 37636, "breasts": 37637, "aaf": 37638, "anthus": 37639, "sker": 37640, "owed": 37641, "heros": 37642, "gifs": 37643, "fosters": 37644, "eaters": 37645, "dues": 37646, "_/": 37647, "lymphoma": 37648, "sfam": 37649, "megal": 37650, "afridi": 37651, "agic": 37652, "pamp": 37653, "jealousy": 37654, "ðŁijĮðŁı¼": 37655, "calculate": 37656, "napping": 37657, "gale": 37658, "ðŁ¦Ħ": 37659, "lubbock": 37660, "assumed": 37661, "renting": 37662, "íĥľ": 37663, "suburb": 37664, "ãĤ·": 37665, "technic": 37666, "ucla": 37667, "infront": 37668, "garnet": 37669, "steroids": 37670, "striving": 37671, "howar": 37672, "mover": 37673, "leton": 37674, "bulldo": 37675, "isin": 37676, "ciao": 37677, "snz": 37678, "forefront": 37679, "dams": 37680, "midwife": 37681, "mawards": 37682, "clapton": 37683, "wein": 37684, "subsidies": 37685, "sproud": 37686, "rotherham": 37687, "phantom": 37688, "arach": 37689, "spiel": 37690, "racket": 37691, "selamat": 37692, "noon": 37693, "lbc": 37694, "entially": 37695, "ðŁĴ¸": 37696, "silve": 37697, "moud": 37698, "kinetic": 37699, "yasi": 37700, "ðŁİ©": 37701, "ool": 37702, "miku": 37703, "iza": 37704, "fera": 37705, "floren": 37706, "barbershop": 37707, "groot": 37708, "zest": 37709, "nears": 37710, "stanis": 37711, "zand": 37712, "policeman": 37713, "jurisdic": 37714, "formations": 37715, "apparatus": 37716, "spd": 37717, "artifact": 37718, "tosc": 37719, "motivating": 37720, "womancrush": 37721, "redro": 37722, "diagnostics": 37723, "raza": 37724, "outfitters": 37725, "elxn": 37726, "dodgy": 37727, "ryn": 37728, "shd": 37729, "orthodon": 37730, "olde": 37731, "jayanti": 37732, "balances": 37733, "quickest": 37734, "canton": 37735, "fridayreads": 37736, "!*": 37737, "naa": 37738, "aak": 37739, "ðŁĶ·": 37740, "behaviors": 37741, "raspberries": 37742, "ä»": 37743, "political": 37744, "camil": 37745, 
"åľ": 37746, "dik": 37747, "astounding": 37748, "liebe": 37749, "novelty": 37750, "turmoil": 37751, "sully": 37752, "springbreak": 37753, "honouring": 37754, "ccg": 37755, "ðŁıĴ": 37756, "mylittle": 37757, "kyc": 37758, "proms": 37759, "ðŁķĬ": 37760, "è": 37761, "bige": 37762, "avril": 37763, "ðŁĩµðŁĩ°": 37764, "marion": 37765, "asants": 37766, "surya": 37767, "octag": 37768, "lufthan": 37769, "acron": 37770, "fayetteville": 37771, "tique": 37772, "loves": 37773, "enca": 37774, "dekalb": 37775, "taver": 37776, "devote": 37777, "auxiliary": 37778, "johannes": 37779, "treadmill": 37780, "ayan": 37781, "qur": 37782, "donaldson": 37783, "cheryl": 37784, "\"....": 37785, "sven": 37786, "kirsty": 37787, "gunners": 37788, "radish": 37789, "oahu": 37790, "vsky": 37791, "ible": 37792, "concourse": 37793, "bps": 37794, "eloqu": 37795, "ashford": 37796, "tebow": 37797, "roblox": 37798, "mada": 37799, "driving": 37800, "thday": 37801, "sproject": 37802, "mms": 37803, "banded": 37804, ".!!": 37805, "librarians": 37806, "flannel": 37807, "intolerance": 37808, "heral": 37809, "çµ": 37810, "nemesis": 37811, "lista": 37812, "tarak": 37813, "crypt": 37814, "starplus": 37815, "vishnu": 37816, "scale": 37817, "cris": 37818, "%),": 37819, "jillian": 37820, "reggae": 37821, "pegasus": 37822, "olin": 37823, "ipment": 37824, "manic": 37825, "lfc": 37826, "goddard": 37827, "iteam": 37828, "parlour": 37829, "anchors": 37830, "leeminho": 37831, "tallahassee": 37832, "antit": 37833, "dho": 37834, "kidney": 37835, "yash": 37836, "battled": 37837, "azad": 37838, "garis": 37839, "faulkner": 37840, "sniff": 37841, "paparazzi": 37842, "edm": 37843, "phyllis": 37844, "contested": 37845, "aaay": 37846, "seca": 37847, "kton": 37848, "velve": 37849, "rainier": 37850, "forum": 37851, "tampab": 37852, "hosp": 37853, "tractors": 37854, "oxfordshire": 37855, "notion": 37856, "guangzhou": 37857, "ðŁĺ¯": 37858, "refill": 37859, "wednesdaymotivation": 37860, "slider": 37861, "mukherjee": 37862, "pratt": 37863, "fontaine": 37864, "alphon": 37865, "afar": 37866, "tsi": 37867, "pesticides": 37868, "fiends": 37869, "mocking": 37870, "braw": 37871, "transat": 37872, "doses": 37873, "cores": 37874, "homophobia": 37875, "documenting": 37876, "zlatan": 37877, "condoms": 37878, "sé": 37879, "sunset": 37880, "kunst": 37881, "tonga": 37882, "ส": 37883, "vation": 37884, "spray": 37885, "chowder": 37886, "raps": 37887, "palladium": 37888, "norwood": 37889, "musichistory": 37890, "hooker": 37891, "sisi": 37892, "osprey": 37893, "phys": 37894, "conceded": 37895, "bobcat": 37896, "armad": 37897, "zeit": 37898, "ÙĦ": 37899, "ðŁĺģðŁĺģ": 37900, "meridi": 37901, "ðŁĩ·ðŁĩº": 37902, "cornwall": 37903, "!),": 37904, "touchdowns": 37905, "zeit": 37906, "chalet": 37907, "mmm": 37908, "alche": 37909, "gorilla": 37910, "foss": 37911, "atiku": 37912, "luminous": 37913, "ivanka": 37914, "beek": 37915, "stares": 37916, "swiss": 37917, "âĿ¤âĿ¤âĿ¤âĿ¤": 37918, "scrubs": 37919, "meath": 37920, "gustav": 37921, "jogging": 37922, "confetti": 37923, "asos": 37924, "ersfc": 37925, "breitbart": 37926, "applicable": 37927, "authored": 37928, "yaho": 37929, "hin": 37930, "displacement": 37931, "jv": 37932, "ðŁĮ¹ðŁĮ¹": 37933, "otc": 37934, "nonprofits": 37935, "diecast": 37936, "gusto": 37937, "intestin": 37938, "cages": 37939, "meen": 37940, "lukas": 37941, "mooney": 37942, "ðŁĺ·": 37943, "veryday": 37944, "torah": 37945, "ission": 37946, "wac": 37947, "leveraging": 37948, "ishable": 37949, "cuse": 37950, "lewood": 37951, "mayan": 37952, "turntable": 37953, "juice": 37954, 
"trusty": 37955, "tup": 37956, "etiquette": 37957, "supervisors": 37958, "stun": 37959, "guzman": 37960, "conferen": 37961, "rico": 37962, "feast": 37963, "backward": 37964, "polaris": 37965, "miche": 37966, "jog": 37967, "hing": 37968, "fieldhouse": 37969, "veling": 37970, "shocker": 37971, "escence": 37972, "ा": 37973, "vibe": 37974, "anastasia": 37975, "marched": 37976, "killing": 37977, "Ķë": 37978, "fett": 37979, "exoplan": 37980, "...(": 37981, "snowday": 37982, "loh": 37983, "irani": 37984, "lakhs": 37985, "dela": 37986, "pocaly": 37987, "boomers": 37988, "dictatorship": 37989, "acer": 37990, "turkeys": 37991, "quarterfinal": 37992, "musketeers": 37993, "ðŁĴĽðŁĴļ": 37994, "sfx": 37995, "museumweek": 37996, "scala": 37997, "risis": 37998, "(ðŁĵ·": 37999, "ãĢĤ": 38000, "zies": 38001, "boeh": 38002, "hues": 38003, "lusci": 38004, "dola": 38005, "impeachtrump": 38006, "rood": 38007, "doncaster": 38008, "torre": 38009, "heroes": 38010, "foyer": 38011, "tari": 38012, "blurred": 38013, "kew": 38014, "frankly": 38015, "droid": 38016, "apal": 38017, "м": 38018, "yaf": 38019, "bret": 38020, "paragu": 38021, "cacao": 38022, "ðŁĻĮðŁı¾": 38023, "rue": 38024, "headaches": 38025, "shawty": 38026, "charley": 38027, "paler": 38028, "gowns": 38029, "correctional": 38030, "ðŁĺ©ðŁĺ©": 38031, "breakingbad": 38032, "oling": 38033, "dap": 38034, "endeavour": 38035, "citadel": 38036, "trad": 38037, "incumbent": 38038, "meditate": 38039, "footed": 38040, "ðŁĴµ": 38041, "shabbat": 38042, "dayofthe": 38043, "willem": 38044, "galway": 38045, "tored": 38046, "marriage": 38047, "fillion": 38048, "sleeveless": 38049, "auditor": 38050, "jinyoung": 38051, "invincible": 38052, "kaduna": 38053, "aand": 38054, "volcanoes": 38055, "moneti": 38056, "indiegogo": 38057, "buccaneers": 38058, "ðŁijīðŁı½": 38059, "ãĢĤ": 38060, "layton": 38061, "cuckoo": 38062, "humber": 38063, "buzzer": 38064, "Ïī": 38065, "tore": 38066, "strains": 38067, "stom": 38068, "paine": 38069, "swe": 38070, "duff": 38071, "zou": 38072, "simi": 38073, "lipp": 38074, "urn": 38075, "seagu": 38076, "ðŁĶ®": 38077, "sundae": 38078, "hic": 38079, "ðŁĺ¨": 38080, "bullpen": 38081, "uper": 38082, "flyover": 38083, "aldridge": 38084, "globes": 38085, "alies": 38086, "kenzie": 38087, "gees": 38088, "ycle": 38089, "splin": 38090, "magenta": 38091, "jha": 38092, "balu": 38093, "ghorn": 38094, "tipper": 38095, "wicker": 38096, "tasteof": 38097, "conclave": 38098, "chale": 38099, "invasi": 38100, "cater": 38101, "dioxide": 38102, "megab": 38103, "winn": 38104, "atp": 38105, "transformative": 38106, "nestled": 38107, "hig": 38108, "bridging": 38109, "lilies": 38110, "cheered": 38111, "baddest": 38112, "scrolls": 38113, "realis": 38114, "diplo": 38115, "ðŁĶ«": 38116, "concession": 38117, "preferences": 38118, "explodes": 38119, "ergon": 38120, "introductory": 38121, "ineau": 38122, "chaf": 38123, "somes": 38124, "landrover": 38125, "spiration": 38126, "sexy": 38127, "scorecard": 38128, "illustrates": 38129, "soulmate": 38130, "wien": 38131, "interdisciplinary": 38132, "forecasting": 38133, "entities": 38134, "glued": 38135, "enlar": 38136, "curt": 38137, "perceptions": 38138, "bootleg": 38139, "mire": 38140, "ashok": 38141, "vaz": 38142, "horne": 38143, "calle": 38144, "aculture": 38145, "theroy": 38146, "nighttime": 38147, "ocal": 38148, "characterdesign": 38149, "armist": 38150, "ðŁĺıðŁĺı": 38151, "yahoo": 38152, "aceae": 38153, "tose": 38154, "evento": 38155, "sout": 38156, "nayanth": 38157, "whom": 38158, "vare": 38159, "rigging": 38160, "genus": 38161, "hive": 
38162, "commands": 38163, "stie": 38164, "daya": 38165, "ethanol": 38166, "enf": 38167, "hifi": 38168, "fluence": 38169, "clemson": 38170, "reinvent": 38171, "thermometer": 38172, "humorous": 38173, "emerging": 38174, "ación": 38175, "ðŁĺĺðŁĺį": 38176, "sity": 38177, "hawke": 38178, "accompanying": 38179, "tility": 38180, "ðŁĺª": 38181, "recess": 38182, "protagonist": 38183, "lery": 38184, "dundal": 38185, "intl": 38186, "brittany": 38187, "qbs": 38188, "offthe": 38189, "marriages": 38190, "howto": 38191, "violated": 38192, "adelaide": 38193, "witt": 38194, "lancer": 38195, "pakv": 38196, "hume": 38197, "stade": 38198, "bragging": 38199, "outright": 38200, "adc": 38201, "superst": 38202, "realtime": 38203, "cures": 38204, "gardeners": 38205, "erock": 38206, "dalejr": 38207, "vero": 38208, "bartol": 38209, "moti": 38210, "mcfly": 38211, "vpn": 38212, "stink": 38213, "overrated": 38214, "guerra": 38215, "etis": 38216, "athome": 38217, "twdfamily": 38218, "thab": 38219, "tnx": 38220, "rafael": 38221, "familytravel": 38222, "xley": 38223, "satanic": 38224, "equations": 38225, "rudy": 38226, "waldorf": 38227, "stani": 38228, "tube": 38229, "measles": 38230, "zimmerman": 38231, "obligations": 38232, "iously": 38233, "bowser": 38234, "transformer": 38235, "shoppe": 38236, "shaken": 38237, "ghouse": 38238, "tod": 38239, "ketball": 38240, "shareholder": 38241, "marca": 38242, "kpmg": 38243, "akan": 38244, "givenchy": 38245, "coastal": 38246, "auth": 38247, "rollercoaster": 38248, "marches": 38249, "coordinate": 38250, "cinema": 38251, "apprentices": 38252, "parlor": 38253, "mito": 38254, "menon": 38255, "considerable": 38256, "barre": 38257, "gloss": 38258, "enhances": 38259, "jazeera": 38260, "falmouth": 38261, "thrash": 38262, "staten": 38263, "kzn": 38264, "engel": 38265, "samanthap": 38266, "floppy": 38267, "salom": 38268, "ðŁıĨðŁıĨ": 38269, "wack": 38270, "deliberate": 38271, "oscill": 38272, "heritag": 38273, "dusted": 38274, "ornithology": 38275, "paddle": 38276, "ferns": 38277, "barun": 38278, "clans": 38279, "anticipate": 38280, "aay": 38281, "matically": 38282, "éĩ": 38283, "tumble": 38284, "postman": 38285, "unicef": 38286, "trotter": 38287, "opd": 38288, "leaflet": 38289, "geist": 38290, "ceasefire": 38291, "screws": 38292, "creation": 38293, "walnuts": 38294, "longhorns": 38295, "understatement": 38296, "abb": 38297, "proximity": 38298, "nax": 38299, "unity": 38300, "turnpike": 38301, "ordained": 38302, "dubstep": 38303, "chakra": 38304, "mech": 38305, "loveher": 38306, "lookalike": 38307, "donnein": 38308, "viron": 38309, "ÙĪ": 38310, "bangers": 38311, "variants": 38312, "outdated": 38313, "inta": 38314, "cristo": 38315, "spelt": 38316, "foodand": 38317, "fon": 38318, "stefani": 38319, "marginal": 38320, "hutton": 38321, "tiara": 38322, "telford": 38323, "quen": 38324, "fairgrounds": 38325, "quetta": 38326, "mikhail": 38327, "healer": 38328, "vball": 38329, "tyre": 38330, "undergrad": 38331, "glend": 38332, "homers": 38333, "scribed": 38334, "maintains": 38335, "poche": 38336, "missal": 38337, "marko": 38338, "uas": 38339, "án": 38340, "shp": 38341, "convey": 38342, "padre": 38343, "saba": 38344, "puglia": 38345, "madhuri": 38346, "paxton": 38347, "chaplain": 38348, "nago": 38349, "casi": 38350, "...!!!": 38351, "flirt": 38352, "saleh": 38353, "kare": 38354, "dire": 38355, "stamped": 38356, "extreme": 38357, "ðŁĺĥðŁĺĥ": 38358, "hoppy": 38359, "guadalupe": 38360, "advantaged": 38361, "euchar": 38362, "plow": 38363, "unn": 38364, "macqu": 38365, "portland": 38366, "clash": 38367, "pes": 
38368, "loubout": 38369, "yp": 38370, "keeping": 38371, "arcadia": 38372, "frankie": 38373, "fiu": 38374, "deth": 38375, "encyclopedia": 38376, "size": 38377, "invests": 38378, "ðŁį©": 38379, "geological": 38380, "franç": 38381, "confront": 38382, "ðŁĺ¥": 38383, "dys": 38384, "afm": 38385, "texan": 38386, "graphene": 38387, "repostapp": 38388, "acf": 38389, "ursula": 38390, "gaza": 38391, "ddled": 38392, "fum": 38393, "wsbtv": 38394, "mbe": 38395, "frontiers": 38396, "chronograph": 38397, "kes": 38398, "interfaith": 38399, "taboo": 38400, "sparta": 38401, "wondo": 38402, "florist": 38403, "embraces": 38404, "caw": 38405, "noel": 38406, "archers": 38407, "ðŁIJ·": 38408, "romano": 38409, "banan": 38410, "shakers": 38411, "melodies": 38412, "geothermal": 38413, "sephora": 38414, "ìļ°": 38415, "од": 38416, "proc": 38417, "handshake": 38418, "pande": 38419, "populated": 38420, "slowdown": 38421, "hortons": 38422, "registrations": 38423, "undeni": 38424, "lants": 38425, "passover": 38426, "thakur": 38427, "lief": 38428, "adhesive": 38429, "petal": 38430, "microscopy": 38431, "memphis": 38432, "confirming": 38433, "airdrop": 38434, "mesmer": 38435, "perceived": 38436, "mingle": 38437, "lifeline": 38438, "ghj": 38439, "worcestershire": 38440, "passions": 38441, "acher": 38442, "ellar": 38443, "aho": 38444, "firenze": 38445, "barang": 38446, "letterman": 38447, "hatfield": 38448, "lucha": 38449, "jeter": 38450, "eshop": 38451, "williams": 38452, "horoscope": 38453, "prede": 38454, "eastbourne": 38455, "durga": 38456, "diversion": 38457, "altrin": 38458, "seismic": 38459, "premiosm": 38460, "narco": 38461, "tir": 38462, "orig": 38463, "orm": 38464, "landfall": 38465, "cious": 38466, "lindo": 38467, "maxine": 38468, "xico": 38469, "tray": 38470, "oswald": 38471, "cba": 38472, "ricotta": 38473, "ncr": 38474, "marau": 38475, "า": 38476, "gladiator": 38477, "chery": 38478, "lung": 38479, "ume": 38480, "popsic": 38481, "longing": 38482, "canals": 38483, "taya": 38484, "decentralized": 38485, "shopp": 38486, "pressures": 38487, "maharaj": 38488, "etihad": 38489, "walgreens": 38490, "succession": 38491, "signaling": 38492, "lig": 38493, "staffer": 38494, "northkorea": 38495, "defying": 38496, "asma": 38497, "deg": 38498, "perimeter": 38499, "oakville": 38500, "msk": 38501, "baltimore": 38502, "receip": 38503, "deple": 38504, "ðŁĺŃðŁĺĤ": 38505, "jamboree": 38506, ">.<": 38507, "rspb": 38508, "punisher": 38509, "considerably": 38510, "intothe": 38511, "parisian": 38512, "accelerated": 38513, "polyester": 38514, "lowes": 38515, "frying": 38516, "sautéed": 38517, "mouths": 38518, "seychelles": 38519, "rax": 38520, "godis": 38521, "dakota": 38522, "housewives": 38523, "theme": 38524, "matinee": 38525, "blackbird": 38526, "yesung": 38527, "prefers": 38528, "pellegr": 38529, "inated": 38530, "trunks": 38531, "strongertogether": 38532, "repet": 38533, "repairing": 38534, "pedals": 38535, "tolerant": 38536, "herr": 38537, "dunne": 38538, "indication": 38539, "decatur": 38540, "btv": 38541, "exhibitors": 38542, "ikon": 38543, "fridaymotivation": 38544, "bragg": 38545, "livetweet": 38546, "alves": 38547, "womensart": 38548, "foreigners": 38549, "wallets": 38550, "mindy": 38551, "laney": 38552, "bbin": 38553, "tvmiaw": 38554, "lifter": 38555, "target": 38556, "tame": 38557, "drou": 38558, "astrophotography": 38559, "mpc": 38560, "gpu": 38561, "nordstrom": 38562, "friction": 38563, "runoff": 38564, "lovable": 38565, "spnfamily": 38566, "extingui": 38567, "bloody": 38568, "schel": 38569, "artistry": 38570, "swish": 
38571, "scarce": 38572, "phils": 38573, "maxim": 38574, "possum": 38575, "compromised": 38576, "styli": 38577, "scfc": 38578, "issa": 38579, "birmingham": 38580, "sketched": 38581, "angelica": 38582, "ordinance": 38583, "jets": 38584, "conquer": 38585, "ðŁĺIJ": 38586, "onlineshopping": 38587, "sori": 38588, "reasonably": 38589, "nuestro": 38590, "arturo": 38591, "chl": 38592, "benefici": 38593, "sphoto": 38594, "welt": 38595, "nikk": 38596, "ðŁ¤ŀ": 38597, "danao": 38598, "formid": 38599, "asse": 38600, "afirst": 38601, "âľĤ": 38602, "gillette": 38603, "assor": 38604, "anonym": 38605, "selca": 38606, "femi": 38607, "bearable": 38608, "yand": 38609, "armory": 38610, "crepe": 38611, "celticfc": 38612, "bravo": 38613, "inexpensive": 38614, "delec": 38615, "gecko": 38616, "newmarket": 38617, "snowflakes": 38618, "kabir": 38619, "contra": 38620, "canning": 38621, "morpho": 38622, "garwal": 38623, "ðŁĴĥðŁı»": 38624, "fighting": 38625, "mutation": 38626, "woody": 38627, "jugg": 38628, "graces": 38629, "premiosmtvmiaw": 38630, "kennedy": 38631, "gup": 38632, "sae": 38633, "opha": 38634, "offspring": 38635, "finisher": 38636, "betts": 38637, "spanning": 38638, "marj": 38639, "hone": 38640, "shing": 38641, "continents": 38642, "samanthaprabhu": 38643, "unrelated": 38644, "lacy": 38645, "explosions": 38646, "benjamin": 38647, "sophie": 38648, "noting": 38649, "microsoft": 38650, "assen": 38651, "ahoy": 38652, "iker": 38653, "hofer": 38654, "moe": 38655, "ahmadi": 38656, "yann": 38657, "anak": 38658, "mahi": 38659, "beu": 38660, "ahah": 38661, "creeper": 38662, "baahubali": 38663, "amat": 38664, "priory": 38665, "hawkeye": 38666, "deloitte": 38667, "skoda": 38668, "printmaking": 38669, "assembling": 38670, "miraculous": 38671, "noch": 38672, "swo": 38673, "lega": 38674, "operates": 38675, "borderlands": 38676, "elie": 38677, "strongh": 38678, "reptiles": 38679, "pirate": 38680, "unfold": 38681, "¯": 38682, "qualcomm": 38683, "unpredictable": 38684, "otr": 38685, "rosewood": 38686, "directional": 38687, "counselors": 38688, "cornell": 38689, "liberated": 38690, "jad": 38691, "irregular": 38692, "bulgarian": 38693, "highness": 38694, "vodafone": 38695, "swild": 38696, "minimize": 38697, "grazie": 38698, "à¹ĩ": 38699, "rstats": 38700, "streep": 38701, "ometric": 38702, "humble": 38703, "lump": 38704, "lille": 38705, "bü": 38706, "homedepot": 38707, "tripadvisor": 38708, "kiwan": 38709, "avia": 38710, "erz": 38711, "exico": 38712, "duf": 38713, "blumen": 38714, "mizing": 38715, "arma": 38716, "inim": 38717, "constan": 38718, "sora": 38719, "jual": 38720, "aun": 38721, "twell": 38722, "trenches": 38723, "hera": 38724, "rk": 38725, "poplar": 38726, "recipeoftheday": 38727, "llan": 38728, "bhuban": 38729, "shortages": 38730, "ingdon": 38731, "bridgewater": 38732, "ðŁIJĺ": 38733, "fortnite": 38734, "camden": 38735, "uncture": 38736, "prow": 38737, "colonies": 38738, "tks": 38739, "ngo": 38740, "bhm": 38741, "livepd": 38742, "splace": 38743, "slike": 38744, "happyeaster": 38745, "terrence": 38746, "revolver": 38747, "jed": 38748, "yyyy": 38749, "officeof": 38750, "mts": 38751, "existential": 38752, "rourke": 38753, "explorebc": 38754, "ssed": 38755, "priest": 38756, "vixen": 38757, "siding": 38758, "kpa": 38759, "ahar": 38760, "juic": 38761, "obstruc": 38762, "forensics": 38763, "ukmfg": 38764, "cancellation": 38765, "weary": 38766, "abq": 38767, "elec": 38768, "prized": 38769, "debts": 38770, "mezz": 38771, "salvatore": 38772, "mdc": 38773, "grette": 38774, "cgc": 38775, "thon": 38776, "snowstorm": 38777, 
"tsch": 38778, "cookery": 38779, "å¹": 38780, "waxing": 38781, "nacional": 38782, "murs": 38783, "rave": 38784, "capes": 38785, "germain": 38786, "dripping": 38787, "submitting": 38788, "omelette": 38789, "iteration": 38790, "ajes": 38791, "shimmer": 38792, "fueling": 38793, "ðŁĩ§ðŁĩª": 38794, "lipo": 38795, "bobble": 38796, "unfollow": 38797, "islamist": 38798, "hiber": 38799, "cats": 38800, "agentsofshield": 38801, "sensi": 38802, "_____": 38803, "steria": 38804, "instal": 38805, "auspicious": 38806, "harrow": 38807, "overland": 38808, "feminists": 38809, "instant": 38810, "chariot": 38811, "blindness": 38812, "sped": 38813, "scarec": 38814, "nuit": 38815, "miniatures": 38816, "hoseok": 38817, "glock": 38818, "fifaworldcup": 38819, "ete": 38820, "dism": 38821, "weiner": 38822, "exfoli": 38823, "earts": 38824, "à¸Ķ": 38825, "myart": 38826, "manil": 38827, "issant": 38828, "forma": 38829, "incu": 38830, "buffalob": 38831, "intim": 38832, "mccul": 38833, "anjali": 38834, "popo": 38835, "undoub": 38836, "hila": 38837, "fungal": 38838, "thankful": 38839, "futur": 38840, "endish": 38841, "rends": 38842, "thar": 38843, "sheff": 38844, "ringo": 38845, "nicholls": 38846, "iowa": 38847, "potom": 38848, "clams": 38849, "ãģĦ": 38850, "aconf": 38851, "stadiums": 38852, "dimp": 38853, "dik": 38854, "residences": 38855, "dov": 38856, "caricature": 38857, "seagull": 38858, "klm": 38859, "confess": 38860, "slapped": 38861, "celeb": 38862, "turbines": 38863, "ppv": 38864, "nurture": 38865, "elab": 38866, ".....#": 38867, "tuff": 38868, "depress": 38869, "alfar": 38870, "amiibo": 38871, "dispon": 38872, "ewing": 38873, "queer": 38874, "friends": 38875, "forre": 38876, "âĺ¼": 38877, "swt": 38878, "aquarius": 38879, "headliner": 38880, "curd": 38881, "figs": 38882, "otters": 38883, "lovefl": 38884, "kareem": 38885, "govegan": 38886, "friyay": 38887, "consolation": 38888, "atri": 38889, "ì§Ħ": 38890, "âĺĿï¸ı": 38891, "polyne": 38892, "gued": 38893, "oya": 38894, "laus": 38895, "intestinal": 38896, "camilla": 38897, "scalp": 38898, "pir": 38899, "leeds": 38900, "horrifying": 38901, "boretum": 38902, "dandelion": 38903, "ferrer": 38904, "ellic": 38905, "asx": 38906, "soren": 38907, "reloaded": 38908, "aleague": 38909, "navigator": 38910, "inette": 38911, "addams": 38912, "alchemist": 38913, "akshay": 38914, "dystopian": 38915, "awec": 38916, "naya": 38917, "alisa": 38918, "ailed": 38919, "agor": 38920, "aviator": 38921, "alizer": 38922, "smobile": 38923, "findyourpark": 38924, "copying": 38925, "toddy": 38926, "shti": 38927, "monger": 38928, "calhoun": 38929, "napkin": 38930, "breakup": 38931, "yatra": 38932, "sethu": 38933, "richi": 38934, "erasmus": 38935, "ferry": 38936, "amore": 38937, "practise": 38938, "bobo": 38939, "powerpoint": 38940, "oose": 38941, "liffe": 38942, "china": 38943, "shka": 38944, "fadnavis": 38945, "duane": 38946, "waron": 38947, "false": 38948, "ðŁļĤ": 38949, "washes": 38950, "discip": 38951, "========": 38952, "gk": 38953, "abb": 38954, "stubborn": 38955, "medieval": 38956, "pci": 38957, "ðŁįª": 38958, "marilyn": 38959, "hyo": 38960, "mandi": 38961, "cri": 38962, "predecess": 38963, "continuation": 38964, "omusic": 38965, "slat": 38966, "whal": 38967, "mallory": 38968, "bonn": 38969, "shenzhen": 38970, "cai": 38971, "âĺĥ": 38972, "safest": 38973, "forwards": 38974, "drawers": 38975, "blasted": 38976, "slee": 38977, "morphe": 38978, "mbta": 38979, "dumbass": 38980, "ÑĦоÑĤо": 38981, "alhamdulillah": 38982, "eclub": 38983, "albeit": 38984, "healey": 38985, "ayurveda": 38986, 
"advertised": 38987, "crocs": 38988, "ittles": 38989, "bryson": 38990, "bei": 38991, "njpw": 38992, "honoree": 38993, "fused": 38994, "ðŁĶĺ": 38995, "multin": 38996, "naga": 38997, "departs": 38998, "kop": 38999, "kino": 39000, "jharkhand": 39001, "edna": 39002, "axle": 39003, "milton": 39004, "supremacist": 39005, "marrakech": 39006, "dominic": 39007, "transcript": 39008, "][#": 39009, ":).": 39010, "woc": 39011, "surrounds": 39012, "ogil": 39013, "leaflets": 39014, "cowell": 39015, "whew": 39016, "trude": 39017, "prolifer": 39018, "succes": 39019, "sportsman": 39020, "condom": 39021, "poche": 39022, "kup": 39023, "imprisonment": 39024, "{}": 39025, "scrambled": 39026, "åĽ": 39027, "kaine": 39028, "cellphone": 39029, "metamor": 39030, "coni": 39031, "remnants": 39032, "eez": 39033, "downpour": 39034, "afternoon": 39035, "exercising": 39036, "berser": 39037, "architecture": 39038, "wicklow": 39039, "mns": 39040, "isp": 39041, "boc": 39042, "niss": 39043, "mnwild": 39044, "stumble": 39045, "rsi": 39046, "luffy": 39047, "silen": 39048, "ddad": 39049, "bullies": 39050, "hawker": 39051, "bbcc": 39052, "scuba": 39053, "epp": 39054, "quets": 39055, "foraging": 39056, "pallet": 39057, "hadi": 39058, "cinematographer": 39059, "catchers": 39060, "toaster": 39061, "khi": 39062, "litecoin": 39063, "kidlit": 39064, "amherst": 39065, "mauricio": 39066, "ipad": 39067, "marmalade": 39068, "fey": 39069, "donnelly": 39070, "gto": 39071, "estas": 39072, "cerebral": 39073, "antgrasso": 39074, "zzled": 39075, "virgil": 39076, "swapped": 39077, "ðŁĺħðŁĺħ": 39078, "nodapl": 39079, "greatest": 39080, "nhlbruins": 39081, "fraser": 39082, "bmo": 39083, "anew": 39084, ".âĿ¤ï¸ı": 39085, "segregation": 39086, "remarkably": 39087, "mccormick": 39088, "logger": 39089, "eras": 39090, "contracting": 39091, "âłĢâłĢ": 39092, "yorks": 39093, "ukulele": 39094, "touchscreen": 39095, "decked": 39096, "benn": 39097, "southwark": 39098, "ravin": 39099, "numis": 39100, "ðŁ¤Ļ": 39101, "rut": 39102, "greco": 39103, "ethic": 39104, "redneck": 39105, "arr": 39106, "tcs": 39107, "ihri": 39108, "ðŁĩ«ðŁĩ·": 39109, "lk": 39110, "inherited": 39111, "zyk": 39112, "viaduct": 39113, "martyred": 39114, "higu": 39115, "ssn": 39116, "bein": 39117, "streetstyle": 39118, "fergie": 39119, "bankof": 39120, "æĹ¥": 39121, "stakeholder": 39122, "exemplary": 39123, "cress": 39124, "essa": 39125, "erotica": 39126, "intrepid": 39127, "gomes": 39128, "braun": 39129, "bethany": 39130, "bangtan": 39131, "pulmonary": 39132, "milling": 39133, "doctorate": 39134, "trumprussia": 39135, "र": 39136, "sani": 39137, "blatt": 39138, "plau": 39139, "deprived": 39140, "tle": 39141, "fully": 39142, "bourn": 39143, "stak": 39144, "lufthansa": 39145, "kiosk": 39146, "faroo": 39147, "defy": 39148, "badan": 39149, "ðŁĺĺâĿ¤ï¸ı": 39150, "ritz": 39151, "trisha": 39152, "rands": 39153, "middlesex": 39154, "arabs": 39155, "proj": 39156, "sportscenter": 39157, "repeats": 39158, "ivf": 39159, "bleedblue": 39160, "assure": 39161, "obs": 39162, "territorial": 39163, "elen": 39164, "beverley": 39165, "annah": 39166, "âĿ¤ï¸ıâĿ¤ï¸ıâĿ¤ï¸ıâĿ¤ï¸ı": 39167, "zl": 39168, "forgood": 39169, "sciencefiction": 39170, "glau": 39171, "sonya": 39172, "prith": 39173, "stweets": 39174, "mixers": 39175, "mario": 39176, "antelope": 39177, "writingcommunity": 39178, "wentz": 39179, "denham": 39180, "bedi": 39181, "sfo": 39182, "harleydavidson": 39183, "lookbook": 39184, "immunotherapy": 39185, "orphe": 39186, "esville": 39187, "edged": 39188, "task": 39189, "sbball": 39190, "corrosion": 39191, 
"kilometers": 39192, "costing": 39193, "playback": 39194, "keke": 39195, "divisi": 39196, "uter": 39197, "relocation": 39198, "yelled": 39199, "peng": 39200, "upbeat": 39201, "serve": 39202, "âļł": 39203, "halen": 39204, "stirring": 39205, "rehman": 39206, "env": 39207, "schumacher": 39208, "fragment": 39209, "alkaline": 39210, "sbk": 39211, "resili": 39212, "sharepoint": 39213, "rollover": 39214, "trash": 39215, "counterpart": 39216, "âĻ«": 39217, "obitu": 39218, "à½": 39219, "ãĤ¹": 39220, "mulberry": 39221, "ðŁİĨ": 39222, "autonomy": 39223, "spraying": 39224, "natl": 39225, "loveyou": 39226, "franki": 39227, "nuk": 39228, "escar": 39229, "canteen": 39230, "alibaba": 39231, "deplor": 39232, "molecule": 39233, "pud": 39234, "fortnight": 39235, "blondie": 39236, "sphin": 39237, "portrayal": 39238, "tache": 39239, "bute": 39240, "consisting": 39241, "freepalestine": 39242, "csp": 39243, "immort": 39244, "dns": 39245, "ðŁĴ¥ðŁĴ¥": 39246, "tourde": 39247, "cooking": 39248, "archival": 39249, "gathers": 39250, "bitt": 39251, "banc": 39252, "premature": 39253, "snowball": 39254, "poetryday": 39255, "loudly": 39256, "fugitive": 39257, "eday": 39258, "emra": 39259, "ðŁĩ¸ðŁĩª": 39260, "scien": 39261, "nodejs": 39262, "jurgen": 39263, "jeong": 39264, "bandana": 39265, "unis": 39266, "foxsports": 39267, "vandy": 39268, "provisions": 39269, "weep": 39270, "tuk": 39271, "iko": 39272, "houn": 39273, "ziggy": 39274, "zr": 39275, "fillet": 39276, "bata": 39277, "tink": 39278, "cone": 39279, "wewant": 39280, "kilo": 39281, "horace": 39282, "slt": 39283, "sct": 39284, "staytuned": 39285, "victoria": 39286, "umbria": 39287, "attacker": 39288, "inghamshire": 39289, "frightening": 39290, "noir": 39291, "frat": 39292, "contempt": 39293, "liaison": 39294, "hoi": 39295, "brink": 39296, "trill": 39297, "niagar": 39298, "kickass": 39299, "dundas": 39300, "notmy": 39301, "rhode": 39302, "bumble": 39303, "noxi": 39304, "fag": 39305, "spectators": 39306, "mancrushmonday": 39307, "jinping": 39308, "distract": 39309, "daisy": 39310, "walden": 39311, "portrait": 39312, "arthistory": 39313, "voltron": 39314, "evel": 39315, "isc": 39316, "acm": 39317, "rite": 39318, "nao": 39319, "deported": 39320, "sweats": 39321, "rufus": 39322, "lobo": 39323, "laborday": 39324, "gamo": 39325, "ihrithik": 39326, "blit": 39327, "abdominal": 39328, "ãħ¤ãħ¤ãħ¤ãħ¤": 39329, "iit": 39330, "eq": 39331, "busy": 39332, "alluarjun": 39333, "undisclosed": 39334, "deton": 39335, "procreate": 39336, "kil": 39337, "ðŁİĤðŁİĤ": 39338, "mitchell": 39339, "kii": 39340, "inheritance": 39341, "alp": 39342, "joburg": 39343, "patrolling": 39344, "compulsory": 39345, "unsigned": 39346, "niam": 39347, "lga": 39348, "eshopsuk": 39349, "trilli": 39350, "maw": 39351, "appreciating": 39352, "rockab": 39353, "mañana": 39354, "antal": 39355, "malvern": 39356, "royo": 39357, "grandprix": 39358, "sutton": 39359, "goftheday": 39360, "digi": 39361, "ãħĭãħĭãħĭãħĭ": 39362, "tles": 39363, "varanasi": 39364, "erected": 39365, "disciples": 39366, "contact": 39367, "ðŁĺµ": 39368, "lid": 39369, "â¬ĩ": 39370, "scentre": 39371, "radiator": 39372, "ingtips": 39373, "transitions": 39374, "thursdaymotivation": 39375, "chemical": 39376, "separati": 39377, "salis": 39378, "mim": 39379, "geographical": 39380, "bookfest": 39381, "/.": 39382, "âľĭ": 39383, "vae": 39384, "currie": 39385, "aggarwal": 39386, "acceleration": 39387, "theses": 39388, "lgm": 39389, "umass": 39390, "proportions": 39391, "nata": 39392, "anians": 39393, "kuch": 39394, "beacons": 39395, "apr": 39396, "@#": 39397, 
"ðŁĴªðŁı¾": 39398, "nuke": 39399, "sheraton": 39400, "kio": 39401, "makati": 39402, "politico": 39403, "morale": 39404, "ìĻ": 39405, "economically": 39406, "ggly": 39407, "ssen": 39408, "pastries": 39409, "internships": 39410, "vicente": 39411, "fantaken": 39412, "avengers": 39413, "accuse": 39414, "sleepover": 39415, "indicated": 39416, "thedream": 39417, "sterone": 39418, "renders": 39419, "frost": 39420, "oui": 39421, "gregg": 39422, "dore": 39423, "⾨⾨⾨": 39424, "pugs": 39425, "saty": 39426, "numb": 39427, "hemsworth": 39428, "tami": 39429, "lassic": 39430, "schiff": 39431, "iglesias": 39432, "agawa": 39433, "]\"": 39434, "reshi": 39435, "gamestop": 39436, "divorced": 39437, "theater": 39438, "claudi": 39439, "unconventional": 39440, "prophets": 39441, "acin": 39442, "twelf": 39443, "towering": 39444, "tml": 39445, "sclerosis": 39446, "kwan": 39447, "gets": 39448, "disturb": 39449, "naira": 39450, "energ": 39451, "piracy": 39452, "pruitt": 39453, "notified": 39454, "henna": 39455, "bram": 39456, "groundwater": 39457, "bls": 39458, "optimis": 39459, "$)": 39460, "lucie": 39461, "bizhour": 39462, "fangirling": 39463, "grills": 39464, "orl": 39465, "verse": 39466, "cina": 39467, "lawless": 39468, "artistsontwitter": 39469, "televised": 39470, "marshmallows": 39471, "radiohead": 39472, "barr": 39473, "mfc": 39474, "brevi": 39475, "mmorpg": 39476, "gaya": 39477, "âĸ«": 39478, "subtitles": 39479, "jt": 39480, "disneyland": 39481, "tobago": 39482, "nhm": 39483, "groove": 39484, "fiawec": 39485, "\"/": 39486, "bao": 39487, "scrabble": 39488, "omni": 39489, "ffl": 39490, "umc": 39491, "simba": 39492, "alier": 39493, "terrell": 39494, "plume": 39495, "midi": 39496, "dignit": 39497, "coc": 39498, "brut": 39499, "adata": 39500, "alchemy": 39501, "dsm": 39502, "ðŁĺĨðŁĺĨ": 39503, "wintry": 39504, "spares": 39505, "cuer": 39506, "conclusions": 39507, "toys": 39508, "odor": 39509, "flann": 39510, "garvey": 39511, "scriptions": 39512, "inspections": 39513, "catap": 39514, "anglo": 39515, "stlouis": 39516, "heimer": 39517, "atay": 39518, "trich": 39519, "enyc": 39520, "childs": 39521, "ventil": 39522, "montp": 39523, "guillermo": 39524, "circulare": 39525, "zell": 39526, "modeled": 39527, "craftsman": 39528, "alina": 39529, "stimulation": 39530, "cashew": 39531, "judas": 39532, "bestof": 39533, "toire": 39534, "suspends": 39535, "scollege": 39536, "realising": 39537, "bytes": 39538, "bloods": 39539, "assi": 39540, "ðŁĴ¿": 39541, "ohs": 39542, "ðŁįĭ": 39543, "scallop": 39544, "व": 39545, "gifting": 39546, "camogie": 39547, "wilkes": 39548, "ozzy": 39549, "ðŁ¤¤": 39550, "veronic": 39551, "savoy": 39552, "demetri": 39553, "babygirl": 39554, "ðŁĺįðŁĺŃ": 39555, "sox": 39556, "clyde": 39557, "inductee": 39558, "countdown": 39559, "selfcare": 39560, "à¤ľ": 39561, "vika": 39562, "torre": 39563, "phdchat": 39564, "pears": 39565, "awh": 39566, "suffrage": 39567, "lesn": 39568, "admiration": 39569, "mpp": 39570, "sharkweek": 39571, "schulz": 39572, "santorini": 39573, "clover": 39574, "(*": 39575, "strasbourg": 39576, "exiting": 39577, "soyu": 39578, "fingerprint": 39579, "chea": 39580, "ãĢľ": 39581, "vindic": 39582, "songwriters": 39583, "soa": 39584, "prouder": 39585, "nama": 39586, "=))": 39587, "simplest": 39588, "deliciously": 39589, "gilles": 39590, "uq": 39591, "mnwx": 39592, "epp": 39593, "shun": 39594, "kennel": 39595, "fallon": 39596, "ðŁIJ£": 39597, "sind": 39598, "tragically": 39599, "outes": 39600, "modernism": 39601, "coke": 39602, "gyn": 39603, "spion": 39604, "âĺ¹ï¸ı": 39605, "leam": 39606, 
"compressor": 39607, "apologise": 39608, "twentyon": 39609, "fanatics": 39610, "âĻ»": 39611, "scotsman": 39612, "sawa": 39613, "kou": 39614, "aser": 39615, "à¸ļ": 39616, "welterweight": 39617, "phenom": 39618, "twickenham": 39619, "stria": 39620, "pout": 39621, "kaz": 39622, "giam": 39623, "cdp": 39624, "hoy": 39625, "employ": 39626, "redmond": 39627, "à¸Ħà¸": 39628, "smere": 39629, "trancefamily": 39630, "protocols": 39631, "piece": 39632, "luiz": 39633, "iteracy": 39634, "carls": 39635, "unitedstates": 39636, "harmed": 39637, "phdlife": 39638, "chaw": 39639, "footprints": 39640, "lé": 39641, "choker": 39642, "zana": 39643, "slipper": 39644, "ericsson": 39645, "insulting": 39646, "artichoke": 39647, "advising": 39648, "acquisitions": 39649, "opor": 39650, "mutations": 39651, "rear": 39652, "à¥ģ": 39653, "podcast": 39654, "wither": 39655, "kung": 39656, "íĺ¸": 39657, "winslow": 39658, "diapers": 39659, "ðŁĵ¸@": 39660, "ecker": 39661, "collar": 39662, "huey": 39663, "giro": 39664, "monogram": 39665, "kasich": 39666, "siveness": 39667, "malaysi": 39668, "aromatic": 39669, "gres": 39670, "galileo": 39671, "uji": 39672, "robb": 39673, "drm": 39674, "nonetheless": 39675, "asa": 39676, ":>": 39677, "loa": 39678, "lnp": 39679, "atwork": 39680, "agt": 39681, "lakshmi": 39682, "pipelines": 39683, "idal": 39684, "strel": 39685, "reall": 39686, "chainz": 39687, "stonewall": 39688, "sansk": 39689, "ðŁı´": 39690, "piedmont": 39691, "hostess": 39692, "ciu": 39693, "té": 39694, "analyses": 39695, "wilhelm": 39696, "scotty": 39697, "rwby": 39698, "mosquit": 39699, "usemb": 39700, "quins": 39701, "ðŁijİ": 39702, "tucker": 39703, "sconf": 39704, "specifications": 39705, "psychiatry": 39706, "brookes": 39707, "sils": 39708, "olaf": 39709, "deto": 39710, "codi": 39711, "clip": 39712, "filth": 39713, "womancrushwednesday": 39714, "goto": 39715, "angerous": 39716, "beale": 39717, "wtc": 39718, "panelist": 39719, "nex": 39720, "larsen": 39721, "emilio": 39722, "tableau": 39723, "hitters": 39724, "conceived": 39725, "americani": 39726, "ortega": 39727, "mardi": 39728, "Ñĥ": 39729, "paintball": 39730, "thirsty": 39731, "newyorker": 39732, "etisation": 39733, "goss": 39734, "weaker": 39735, "ugh": 39736, "troll": 39737, "harga": 39738, "dual": 39739, "ghtning": 39740, "atine": 39741, "ðŁĺİðŁĺİðŁĺİ": 39742, "cookout": 39743, "pyrenees": 39744, "poss": 39745, "authentication": 39746, "sportswear": 39747, "yunho": 39748, "kiro": 39749, "archipel": 39750, "shenko": 39751, "render": 39752, "novation": 39753, "divinity": 39754, "ðŁij£": 39755, "sufi": 39756, "humbling": 39757, "geopol": 39758, "devotees": 39759, "waitress": 39760, "trough": 39761, "pyro": 39762, "iba": 39763, "bling": 39764, "graf": 39765, "epilots": 39766, "btr": 39767, "oftball": 39768, "basking": 39769, "dominos": 39770, "soom": 39771, "rath": 39772, "sheryl": 39773, "quel": 39774, "astronomical": 39775, "weld": 39776, "tracklist": 39777, "signee": 39778, "sleepless": 39779, "comman": 39780, "chron": 39781, "summon": 39782, "puremichigan": 39783, "crispr": 39784, "slip": 39785, "lagi": 39786, "raq": 39787, "umu": 39788, "thalap": 39789, "charmed": 39790, "scrump": 39791, "quadcopter": 39792, "skip": 39793, "petersen": 39794, "muni": 39795, "ðŁĮ¾": 39796, "monaghan": 39797, "trays": 39798, "icked": 39799, "canadaday": 39800, "tegr": 39801, "�": 39802, "hotness": 39803, "heavymetal": 39804, "abar": 39805, "gopdebate": 39806, "azul": 39807, "spiderman": 39808, "sunflowers": 39809, "ľë": 39810, "webcomics": 39811, "bard": 39812, "в": 39813, "nicholas": 
39814, "slush": 39815, "raman": 39816, "markham": 39817, "fficial": 39818, "ffler": 39819, "íĬ¸": 39820, "pless": 39821, "anushka": 39822, "toto": 39823, "skaters": 39824, "prowrestling": 39825, "competes": 39826, "ayala": 39827, "mystery": 39828, "thrills": 39829, "mpg": 39830, "independently": 39831, "yul": 39832, "imperative": 39833, "formidable": 39834, "tireless": 39835, "stacking": 39836, "tongues": 39837, "maltese": 39838, "potts": 39839, "matti": 39840, "charting": 39841, "chillout": 39842, "supernova": 39843, "omeo": 39844, "skysports": 39845, "nutty": 39846, "ðŁĹĵï¸ı": 39847, "rohan": 39848, "inspired": 39849, "concierge": 39850, "serra": 39851, "makk": 39852, "galat": 39853, "chipp": 39854, "yev": 39855, "ì£": 39856, "reimbur": 39857, "opul": 39858, "kimberley": 39859, "ieee": 39860, "bremen": 39861, "chitec": 39862, "orin": 39863, "naku": 39864, "bonkers": 39865, "footy": 39866, "emergence": 39867, "ðŁĨĺ": 39868, "stip": 39869, "sergei": 39870, "zoey": 39871, "aime": 39872, "would": 39873, "dyes": 39874, "destiny": 39875, "vinaigrette": 39876, "drier": 39877, "circulareconomy": 39878, "anarchi": 39879, "ssr": 39880, "schel": 39881, "ciner": 39882, "groom": 39883, "determining": 39884, "garmin": 39885, "calais": 39886, "incarceration": 39887, "bukit": 39888, "noi": 39889, "chelmsford": 39890, "mckinley": 39891, "chipped": 39892, "belonged": 39893, "tumors": 39894, "stroud": 39895, "mii": 39896, "influenza": 39897, "wwenxt": 39898, "tundra": 39899, "telecommunications": 39900, "catsofinstagram": 39901, "tages": 39902, "beatty": 39903, "odu": 39904, "mlkday": 39905, "ooper": 39906, "dangle": 39907, "akley": 39908, "crumb": 39909, "antigua": 39910, "timbers": 39911, "rouhani": 39912, "ðŁĴªðŁĴªðŁĴª": 39913, "hafi": 39914, "...!!": 39915, "wcs": 39916, "coop": 39917, "snc": 39918, "litres": 39919, "ãĢĬ": 39920, "haz": 39921, "coz": 39922, "kant": 39923, "greenfield": 39924, "curti": 39925, "yale": 39926, "flyeagles": 39927, "whatsoever": 39928, "worthing": 39929, "roulette": 39930, "flyeaglesfly": 39931, "unda": 39932, "ainted": 39933, "standing": 39934, "luscious": 39935, "hpc": 39936, "efficacy": 39937, "ashland": 39938, "meghan": 39939, "kywx": 39940, "npr": 39941, "bathtub": 39942, "acos": 39943, "hani": 39944, "marcor": 39945, "mantis": 39946, "daisi": 39947, "boba": 39948, "abbie": 39949, "mutil": 39950, "vial": 39951, "spyder": 39952, "poz": 39953, "gti": 39954, "elfie": 39955, "nightw": 39956, "metroid": 39957, "antoni": 39958, "maddie": 39959, "dhry": 39960, "darlings": 39961, "tends": 39962, "taekwondo": 39963, "atlanta": 39964, "meow": 39965, "chloe": 39966, "ãĥİ": 39967, "ymes": 39968, "siberia": 39969, "kcon": 39970, "gues": 39971, "mariner": 39972, "facil": 39973, "azzle": 39974, "[...": 39975, "hannover": 39976, "bavaria": 39977, "virgo": 39978, "teuk": 39979, "usps": 39980, ")#": 39981, "walla": 39982, "sampson": 39983, "needless": 39984, "verbally": 39985, "hayley": 39986, "bowled": 39987, "pius": 39988, "lampard": 39989, "hamstring": 39990, "volvo": 39991, "roadsafety": 39992, "choking": 39993, "sorbet": 39994, "ahem": 39995, "healthyfood": 39996, "braided": 39997, "horticulture": 39998, "crative": 39999, "cheek": 40000, "addo": 40001, "theforce": 40002, "koko": 40003, "schizoph": 40004, "jie": 40005, "wada": 40006, "twentyonepilots": 40007, "hbcu": 40008, "proton": 40009, "pauls": 40010, "louisa": 40011, "latam": 40012, "kyrgy": 40013, "compac": 40014, "sdk": 40015, "sapi": 40016, "???": 40017, "liberalism": 40018, "epsilon": 40019, "aiden": 40020, "wusa": 40021, 
"sprayed": 40022, "basketball": 40023, "kimono": 40024, "bluewave": 40025, "alias": 40026, "ë§Ī": 40027, "mugshot": 40028, "cec": 40029, "dogre": 40030, "adora": 40031, "ðŁĵ·@": 40032, "krakow": 40033, "intrigued": 40034, "exhausting": 40035, "astronomer": 40036, "venison": 40037, "ladybug": 40038, "civ": 40039, "brae": 40040, "usm": 40041, "bribe": 40042, "acupuncture": 40043, "pembroke": 40044, "keating": 40045, "chie": 40046, "yad": 40047, "tsi": 40048, "smi": 40049, "seeding": 40050, "gateshead": 40051, "lisboa": 40052, "gyp": 40053, "canvass": 40054, "ðŁĶ´âļªï¸ı": 40055, "opi": 40056, "nir": 40057, "societal": 40058, "lyte": 40059, "aties": 40060, "csm": 40061, "artery": 40062, "alin": 40063, "akapoor": 40064, "abstracts": 40065, "âĢ¦âĢ¦": 40066, "teenwolf": 40067, "newe": 40068, "travelgram": 40069, "sentimental": 40070, "perched": 40071, "handel": 40072, "hoek": 40073, "fay": 40074, "coordinating": 40075, "animate": 40076, "manian": 40077, "effort": 40078, "jerky": 40079, "fck": 40080, "adrienne": 40081, "mably": 40082, "trading": 40083, "myel": 40084, "spiro": 40085, "sola": 40086, "storing": 40087, "overdrive": 40088, "mondaymorning": 40089, "dreamteam": 40090, "pulse": 40091, "bondi": 40092, "bernie": 40093, "pgatour": 40094, "tripoli": 40095, "sonam": 40096, "platt": 40097, "âļ¡": 40098, "agroup": 40099, "îIJĴ": 40100, "invading": 40101, "vcu": 40102, "kell": 40103, "ños": 40104, "undead": 40105, "podcasting": 40106, "mercedesam": 40107, "manafort": 40108, "cortex": 40109, "queso": 40110, "impeccable": 40111, "palmer": 40112, "wildoz": 40113, "sportsc": 40114, "guacamole": 40115, "dispenser": 40116, "categori": 40117, "stunts": 40118, "peril": 40119, "invitations": 40120, "dunedin": 40121, "xie": 40122, "achieves": 40123, "safer": 40124, "preds": 40125, "phan": 40126, "knuckles": 40127, "kak": 40128, "ignores": 40129, "lovemyjob": 40130, "aruba": 40131, "oundation": 40132, "datacenter": 40133, "covert": 40134, "gring": 40135, "couple": 40136, "ار": 40137, "voli": 40138, "mccle": 40139, "artisans": 40140, "ludo": 40141, "kalam": 40142, "aroma": 40143, "undertaker": 40144, "hula": 40145, "wizkid": 40146, "gumb": 40147, "godfrey": 40148, "bakersfield": 40149, "kern": 40150, "engineer": 40151, "carve": 40152, "palin": 40153, "guarantees": 40154, "pebbles": 40155, "bays": 40156, "zieg": 40157, "fink": 40158, "â¬ĩï¸ıâ¬ĩï¸ı": 40159, "downpours": 40160, "rochelle": 40161, "raspberry": 40162, "ðŁĺ®": 40163, "graphies": 40164, "stomp": 40165, "cafes": 40166, "arized": 40167, "uttar": 40168, "calvary": 40169, "drie": 40170, "crusader": 40171, "busan": 40172, "tuxedo": 40173, "siu": 40174, "seamus": 40175, "cultured": 40176, "blanchard": 40177, "townhouse": 40178, "gered": 40179, "buttermilk": 40180, "fluctu": 40181, "rogerfederer": 40182, "heli": 40183, "ðŁ¦ĥ": 40184, "uous": 40185, "ramesh": 40186, "muppets": 40187, "emailmarketing": 40188, "yess": 40189, "brice": 40190, "rizio": 40191, "pelo": 40192, "donneinarte": 40193, "urable": 40194, "investin": 40195, "bumping": 40196, "rajiv": 40197, "sava": 40198, "thrower": 40199, "forex": 40200, "ohhhh": 40201, "thrust": 40202, "pullman": 40203, "rfid": 40204, "sepsis": 40205, "leed": 40206, "fright": 40207, "rounding": 40208, "neb": 40209, "phins": 40210, "aisha": 40211, "utilizing": 40212, "squats": 40213, "goldsmith": 40214, "jic": 40215, "boks": 40216, "vaus": 40217, "ipo": 40218, "exclusion": 40219, "tariff": 40220, "pokes": 40221, "minal": 40222, "lands": 40223, "enforce": 40224, "washingtondc": 40225, "orchar": 40226, "gx": 40227, 
"marys": 40228, "eyour": 40229, "aussie": 40230, "bakers": 40231, "unpopular": 40232, "latinos": 40233, "large": 40234, "putnam": 40235, "bolo": 40236, "wade": 40237, "pelo": 40238, "dizz": 40239, "obstruction": 40240, "flappy": 40241, "wearethe": 40242, "dependence": 40243, "pajama": 40244, "ete": 40245, "yann": 40246, "ewan": 40247, "discla": 40248, "aay": 40249, "karina": 40250, "eic": 40251, "antrim": 40252, "wsoc": 40253, "negatively": 40254, "kaido": 40255, "fotografia": 40256, "dhru": 40257, "colossal": 40258, "mcleod": 40259, "kwang": 40260, "manipu": 40261, "exhilar": 40262, "usatoday": 40263, "summerslam": 40264, "coles": 40265, "taproom": 40266, "unbeatable": 40267, "dema": 40268, "ticks": 40269, "kling": 40270, "fils": 40271, "campaigners": 40272, "à¸ķ": 40273, "brewster": 40274, "audubon": 40275, "quay": 40276, "chs": 40277, "kigali": 40278, "dler": 40279, "strengthens": 40280, "somal": 40281, "signingday": 40282, "golds": 40283, "pigment": 40284, "orchestral": 40285, "gq": 40286, "linkin": 40287, "ðŁıĩ": 40288, "taw": 40289, "algarve": 40290, "hov": 40291, "earle": 40292, "goldfish": 40293, "amig": 40294, "exer": 40295, "benin": 40296, "druid": 40297, "ðŁIJ¸": 40298, "shem": 40299, "quattro": 40300, "mercen": 40301, "mente": 40302, "incorporating": 40303, "bonanza": 40304, "statefair": 40305, "ende": 40306, "conceptions": 40307, "ees": 40308, "âĻ¥ï¸ıâĻ¥ï¸ı": 40309, "dson": 40310, "firearm": 40311, "orbital": 40312, "weh": 40313, "multip": 40314, "fob": 40315, "requiem": 40316, "plight": 40317, "thouse": 40318, "said": 40319, "ocre": 40320, "remembrance": 40321, "nold": 40322, "chipping": 40323, "bev": 40324, "ert": 40325, "cathy": 40326, "sym": 40327, "riggs": 40328, "mley": 40329, "dialogues": 40330, "slender": 40331, "howl": 40332, "gauteng": 40333, "wdw": 40334, "tobi": 40335, "smokes": 40336, "implo": 40337, "bpm": 40338, "adn": 40339, "mombasa": 40340, "capsul": 40341, "bloomfield": 40342, "articul": 40343, "cleo": 40344, "googled": 40345, "fluffy": 40346, "lard": 40347, "enzyme": 40348, "vesti": 40349, "ibrahi": 40350, "flame": 40351, "emea": 40352, "outages": 40353, "dispropor": 40354, "bleak": 40355, "ansel": 40356, "icker": 40357, "stlouis": 40358, "stockmarket": 40359, "goodfriday": 40360, "sault": 40361, "stalled": 40362, "prom": 40363, "epsom": 40364, "bé": 40365, "these": 40366, "sauces": 40367, "mew": 40368, "litfest": 40369, "pred": 40370, "reu": 40371, "karak": 40372, "sienna": 40373, "ellin": 40374, "biotechnology": 40375, "ï¸ıâĥ£-": 40376, "tactic": 40377, "sain": 40378, "pork": 40379, "monza": 40380, "kaj": 40381, "lush": 40382, "compartment": 40383, "changing": 40384, "shraddhakapoor": 40385, "foal": 40386, "artem": 40387, "cuando": 40388, "canola": 40389, "oriente": 40390, "messe": 40391, "dited": 40392, "brc": 40393, "boxer": 40394, "bbctwo": 40395, "sst": 40396, "mentday": 40397, "eming": 40398, "dewey": 40399, "kofi": 40400, "âŀĸâŀĸâŀĸâŀĸ": 40401, "realization": 40402, "smol": 40403, "twood": 40404, "sanje": 40405, "flagstaff": 40406, "berwick": 40407, "corset": 40408, "canary": 40409, "whistleblower": 40410, "etched": 40411, "composing": 40412, "squeezed": 40413, "bower": 40414, "autodesk": 40415, "neh": 40416, "mathieu": 40417, "baja": 40418, "ÅĤ": 40419, "hydra": 40420, "daim": 40421, "ameri": 40422, "insisted": 40423, "merlot": 40424, "garros": 40425, "heartnews": 40426, "gainesville": 40427, "cutler": 40428, "bode": 40429, "ðŁĺīðŁĺī": 40430, "lewes": 40431, "scountry": 40432, "gsa": 40433, "usu": 40434, "ccm": 40435, "godawgs": 40436, "pharaoh": 
40437, "crae": 40438, "morley": 40439, "hypnoti": 40440, "fades": 40441, "neurons": 40442, "fuzz": 40443, "ingco": 40444, "highlanders": 40445, "stark": 40446, "vigne": 40447, "packets": 40448, "amarillo": 40449, "reuben": 40450, "insults": 40451, "basic": 40452, "vector": 40453, "nme": 40454, "acruz": 40455, "tros": 40456, "transmitter": 40457, "ðŁĺŀ": 40458, "interpret": 40459, "ðŁĺ²": 40460, "prequel": 40461, "mcgowan": 40462, "dissemin": 40463, "ðŁĴĺðŁĴĺ": 40464, "masculinity": 40465, "indiegamedev": 40466, "alive": 40467, "tet": 40468, "petal": 40469, "emailed": 40470, "armed": 40471, "koo": 40472, "heer": 40473, "baird": 40474, "superjunior": 40475, "metropolis": 40476, "delavin": 40477, "declines": 40478, "stitutes": 40479, "Ûģ": 40480, "ptbo": 40481, "glan": 40482, "chores": 40483, "ealing": 40484, "chrissy": 40485, "stemc": 40486, "vian": 40487, "assassinated": 40488, "pronounce": 40489, "illegals": 40490, "discovery": 40491, "cavill": 40492, "frifotos": 40493, "fal": 40494, "soi": 40495, "sabotage": 40496, "tint": 40497, "pdc": 40498, "ðŁİīðŁİĪ": 40499, "ãĤĬãģ": 40500, "jio": 40501, "endeavor": 40502, "insig": 40503, "committees": 40504, "shearer": 40505, "metz": 40506, "marrying": 40507, "hdd": 40508, "gby": 40509, "fret": 40510, "trish": 40511, "pul": 40512, "scripted": 40513, "saki": 40514, "lw": 40515, "keye": 40516, "shimi": 40517, "nanaimo": 40518, "cah": 40519, "ë": 40520, "tempered": 40521, "ician": 40522, "dugg": 40523, "dishwasher": 40524, "airfield": 40525, "srugby": 40526, "grinch": 40527, "yst": 40528, "rms": 40529, "mahatma": 40530, "lankan": 40531, "discar": 40532, "digestion": 40533, "nodes": 40534, "lls": 40535, "omic": 40536, "gutter": 40537, "tisgarh": 40538, "federico": 40539, "electionday": 40540, "bohe": 40541, "mastercard": 40542, "fireball": 40543, "âľĶï¸ı": 40544, "oyster": 40545, "pong": 40546, "dok": 40547, "enroute": 40548, "mvc": 40549, "beatthe": 40550, "alistair": 40551, "shub": 40552, "shaming": 40553, "chernobyl": 40554, "ghibli": 40555, "thes": 40556, "pinion": 40557, "dbs": 40558, "salts": 40559, "iction": 40560, "epiph": 40561, "ncpol": 40562, "inconvenience": 40563, "whitley": 40564, "inspecting": 40565, "woodley": 40566, "wiener": 40567, "skillet": 40568, "noles": 40569, "mca": 40570, "hina": 40571, "asha": 40572, "willingness": 40573, "wellness": 40574, "tamed": 40575, "showtime": 40576, "disadvantaged": 40577, "bernat": 40578, "usn": 40579, "missionaries": 40580, "counselling": 40581, "arrogant": 40582, "quantitative": 40583, "legalization": 40584, "hodge": 40585, "energyefficiency": 40586, "camerondallas": 40587, "possessions": 40588, "pbb": 40589, "harrisburg": 40590, "vg": 40591, "hinduism": 40592, "happythanksgiving": 40593, "fib": 40594, "reacting": 40595, "tweetapicture": 40596, "politi": 40597, "muppet": 40598, "hurrah": 40599, "pace": 40600, "coastguard": 40601, "guarded": 40602, "asam": 40603, "parry": 40604, "forevery": 40605, "xq": 40606, "oomf": 40607, "keanu": 40608, "jind": 40609, "rist": 40610, "customerservice": 40611, "sacred": 40612, "ðŁĺº": 40613, "toner": 40614, "occurrence": 40615, "matu": 40616, "valdez": 40617, "redd": 40618, "isak": 40619, "powerrangers": 40620, "peasant": 40621, "rajini": 40622, "abraham": 40623, "emil": 40624, "cardo": 40625, "tril": 40626, "hairstyles": 40627, "obsolete": 40628, "sampler": 40629, "directive": 40630, "delavinkisses": 40631, "verton": 40632, "glos": 40633, "spay": 40634, "palermo": 40635, "comets": 40636, "manziel": 40637, "chicagof": 40638, "skipped": 40639, "pictorial": 40640, 
"hant": 40641, "bmi": 40642, "aol": 40643, "reopens": 40644, "paddling": 40645, "devos": 40646, "fraud": 40647, "baseline": 40648, "queues": 40649, "spired": 40650, "snare": 40651, "euve": 40652, "descriptions": 40653, "daisies": 40654, "caching": 40655, "galleria": 40656, "trimmed": 40657, "stino": 40658, "recycla": 40659, "icular": 40660, "birken": 40661, "rawlings": 40662, "flix": 40663, "chicas": 40664, "bgt": 40665, "likeli": 40666, "argyll": 40667, "thelove": 40668, "gaston": 40669, "blanca": 40670, "hak": 40671, "fone": 40672, "sailormoon": 40673, "haci": 40674, "imac": 40675, "flyn": 40676, "decan": 40677, "belles": 40678, "apic": 40679, "zog": 40680, "taunton": 40681, "constance": 40682, "lasagna": 40683, "kernel": 40684, "inka": 40685, "harbor": 40686, "collectively": 40687, "calculated": 40688, "aville": 40689, "shilpa": 40690, "purdu": 40691, "gimm": 40692, "funer": 40693, "aest": 40694, "pembrokeshire": 40695, "nightingale": 40696, "nunes": 40697, "hypertension": 40698, "hubert": 40699, "sliders": 40700, "infertility": 40701, "commended": 40702, "transatlantic": 40703, "metrical": 40704, "!!@": 40705, "ÅŁ": 40706, "ssg": 40707, "bacca": 40708, "inverted": 40709, "funfactfriday": 40710, "itans": 40711, "album": 40712, "acquainted": 40713, "rier": 40714, "whelan": 40715, "sarab": 40716, "mue": 40717, "snooze": 40718, "piff": 40719, "agreeing": 40720, "spitting": 40721, "jermaine": 40722, "nye": 40723, "âľıï¸ı": 40724, "ambush": 40725, "zeph": 40726, "congreg": 40727, "university": 40728, "sapp": 40729, "wannabe": 40730, "patrice": 40731, "ibd": 40732, "doglo": 40733, "fridges": 40734, "sund": 40735, "kingston": 40736, "argon": 40737, "kamen": 40738, "hardrock": 40739, "dsley": 40740, "dolores": 40741, "ì°": 40742, "otaku": 40743, "piping": 40744, "behaving": 40745, "âŃIJï¸ıâŃIJï¸ıâŃIJï¸ı": 40746, "bluebird": 40747, "ansari": 40748, "teapot": 40749, "firework": 40750, "crop": 40751, "logans": 40752, "typed": 40753, "thickness": 40754, "igers": 40755, "cfp": 40756, "dysfunctional": 40757, "contrasting": 40758, "etty": 40759, "astonmartin": 40760, "txst": 40761, "dragrace": 40762, "attributes": 40763, "marathon": 40764, "manuscripts": 40765, "johnstone": 40766, "ðŁĺ±ðŁĺ±": 40767, "boer": 40768, "ayu": 40769, "arugula": 40770, "poorest": 40771, "condu": 40772, "assumption": 40773, "anagh": 40774, "noh": 40775, "delavin": 40776, "sitter": 40777, "gö": 40778, "morow": 40779, "kickstart": 40780, "comi": 40781, "glacial": 40782, "ghead": 40783, "bain": 40784, "kershaw": 40785, "endof": 40786, "freud": 40787, "omat": 40788, "iaf": 40789, "hug": 40790, "signup": 40791, "eachother": 40792, "definite": 40793, "tubing": 40794, "shakira": 40795, "ðŁijıðŁı½": 40796, "uuuu": 40797, "swin": 40798, "shambles": 40799, "olas": 40800, "skell": 40801, "britain": 40802, "knw": 40803, "clutter": 40804, "omy": 40805, "jens": 40806, "hanged": 40807, "cityscape": 40808, "scraps": 40809, "unlocking": 40810, "deadliest": 40811, "erno": 40812, "breastcancer": 40813, "ait": 40814, "inspect": 40815, "furi": 40816, "ðŁĴĮ": 40817, "kud": 40818, "jule": 40819, "orah": 40820, "mids": 40821, "mdt": 40822, "burgring": 40823, "rattle": 40824, "pusa": 40825, "stalk": 40826, "cleans": 40827, "issance": 40828, "zek": 40829, "worthit": 40830, "nameis": 40831, "muskoka": 40832, "councilman": 40833, "urbanart": 40834, "barrac": 40835, "unsolved": 40836, "tul": 40837, "gita": 40838, "whiteboard": 40839, "soybeans": 40840, "ement": 40841, "conti": 40842, "saturdaymotivation": 40843, "conveniently": 40844, "docking": 40845, 
"tado": 40846, "âı©": 40847, "spino": 40848, "puppylove": 40849, "pof": 40850, "fabricated": 40851, "robbers": 40852, "adopts": 40853, "tified": 40854, "kkr": 40855, "indulgence": 40856, "noticeable": 40857, "macquarie": 40858, "chapel": 40859, "sensual": 40860, "kiko": 40861, "melanoma": 40862, "loretta": 40863, "liance": 40864, "aben": 40865, "splus": 40866, "gaal": 40867, "acele": 40868, "libdems": 40869, "comparisons": 40870, "ðŁĮµ": 40871, "rhythms": 40872, "mery": 40873, "encapsul": 40874, "napier": 40875, "ðŁijĮðŁijĮðŁijĮ": 40876, "ðŁijIJ": 40877, "platz": 40878, "fresno": 40879, "reformed": 40880, "ranbir": 40881, "elit": 40882, "thebest": 40883, "bhushan": 40884, "vinnie": 40885, "improvised": 40886, "sittin": 40887, "recreated": 40888, "eba": 40889, "ecker": 40890, "acrob": 40891, "ponte": 40892, "cord": 40893, "giddy": 40894, "eurusd": 40895, "fever": 40896, "intuition": 40897, "gari": 40898, "dummies": 40899, "budweiser": 40900, "amendments": 40901, "tetra": 40902, "schnit": 40903, "ayas": 40904, "marys": 40905, "cist": 40906, "kani": 40907, "kermit": 40908, "ðŁĺ±ðŁĺ±ðŁĺ±": 40909, "tinker": 40910, "strolling": 40911, "divisional": 40912, "nigeri": 40913, "ominous": 40914, "menstrual": 40915, "karab": 40916, "khy": 40917, "bwfc": 40918, "panhandle": 40919, "lilli": 40920, "weller": 40921, "strapped": 40922, "sonthe": 40923, "transferring": 40924, "ethereal": 40925, "sneaks": 40926, "rudol": 40927, "gables": 40928, "jacking": 40929, "cincode": 40930, "fortune": 40931, "canadiens": 40932, "confor": 40933, "abnormal": 40934, "franklin": 40935, "tita": 40936, "mula": 40937, "persist": 40938, "cuties": 40939, "kiel": 40940, "ðŁĩ±ðŁĩ": 40941, "hermann": 40942, "awk": 40943, "fiasco": 40944, "koto": 40945, "weta": 40946, "hiker": 40947, "buddy": 40948, "preventive": 40949, "mcgraw": 40950, "gameboy": 40951, "forsyth": 40952, "topshop": 40953, "siob": 40954, "sadh": 40955, "intram": 40956, "followart": 40957, "soaps": 40958, "dragonball": 40959, "oux": 40960, "morrison": 40961, "à¹ĥ": 40962, "lubric": 40963, "adulthood": 40964, "morrisons": 40965, "âļłï¸ı": 40966, "hermo": 40967, "taka": 40968, "stallone": 40969, "misuse": 40970, "teamgb": 40971, "ragha": 40972, "confined": 40973, "aty": 40974, "homophobic": 40975, "nwo": 40976, "skynews": 40977, "hoya": 40978, "acrosse": 40979, "wiiu": 40980, "purée": 40981, "jeddah": 40982, "ðŁ¤§": 40983, "advisers": 40984, "phine": 40985, "anis": 40986, "scrumptious": 40987, "ë°ķ": 40988, "cke": 40989, "viny": 40990, "term": 40991, "sdc": 40992, "odo": 40993, "homeschool": 40994, "vasc": 40995, "leopards": 40996, "deborah": 40997, "illicit": 40998, "curran": 40999, "asroma": 41000, "naught": 41001, "marig": 41002, "brandi": 41003, "emp": 41004, "ðŁĺįðŁijĮ": 41005, "îĮ": 41006, "suspend": 41007, "luz": 41008, "initiation": 41009, "schaft": 41010, "jensenackles": 41011, "crawler": 41012, "postdoc": 41013, "desks": 41014, "trailblazer": 41015, "denomin": 41016, "trix": 41017, "noise": 41018, "poet": 41019, "±ï¸ı": 41020, "smug": 41021, "volatile": 41022, "proofs": 41023, "pharmacist": 41024, "sardinia": 41025, "mashable": 41026, "kimchi": 41027, "coed": 41028, "schalke": 41029, "doodled": 41030, "csw": 41031, "shur": 41032, "rox": 41033, "dok": 41034, "chrisbrown": 41035, "mathematician": 41036, "abound": 41037, "angelic": 41038, "rockford": 41039, "dole": 41040, "yorkers": 41041, "msn": 41042, "gman": 41043, "xavier": 41044, "borrowing": 41045, "markings": 41046, "longhorn": 41047, "kja": 41048, "diverted": 41049, "mmit": 41050, "euphoria": 41051, 
"ayyy": 41052, "tea": 41053, "pah": 41054, "cki": 41055, "uncut": 41056, "liven": 41057, "kyung": 41058, "fanart": 41059, "mering": 41060, "redding": 41061, "amovie": 41062, "gridi": 41063, "cthulhu": 41064, "scholarly": 41065, "judah": 41066, "thbewithyou": 41067, "eucalyp": 41068, "ðŁIJķ": 41069, "hertfordshire": 41070, "courtroom": 41071, "byu": 41072, "auctioned": 41073, "please": 41074, "marcia": 41075, "ê°ĵ": 41076, "succeeded": 41077, "elas": 41078, "arvind": 41079, "tlot": 41080, "saigon": 41081, "rett": 41082, "rakesh": 41083, "fdny": 41084, "asen": 41085, "sebring": 41086, "gladiators": 41087, "youknow": 41088, "vlad": 41089, "gola": 41090, "parap": 41091, "ÑĢи": 41092, "sabcnews": 41093, "oneteam": 41094, "ohl": 41095, "sune": 41096, "rij": 41097, "cdc": 41098, "stargate": 41099, "rundown": 41100, "plato": 41101, "phc": 41102, "chatter": 41103, "raviol": 41104, "mnf": 41105, "mandala": 41106, "liet": 41107, "à¸ķ": 41108, "maria": 41109, "hungover": 41110, "consolidation": 41111, "ferrell": 41112, "traditional": 41113, "iloveart": 41114, "galap": 41115, "ðŁıĮ": 41116, "quezon": 41117, "españa": 41118, "ðŁĩ¨ðŁĩŃ": 41119, "hobby": 41120, "steamboat": 41121, "malign": 41122, "guillau": 41123, "prohi": 41124, "itsme": 41125, "íĥĢ": 41126, "inscription": 41127, "alz": 41128, "marian": 41129, "kade": 41130, "mmon": 41131, "adjusting": 41132, "nests": 41133, "internally": 41134, "cir": 41135, "vikram": 41136, "malala": 41137, "kph": 41138, "felicia": 41139, "thereal": 41140, "captivity": 41141, "atis": 41142, "marcorubio": 41143, "kaleido": 41144, "chev": 41145, "manoj": 41146, "lemore": 41147, "gentri": 41148, "vips": 41149, "trope": 41150, "\"âĢĶ": 41151, "pairings": 41152, "malnutrition": 41153, "fray": 41154, "designation": 41155, "brunomars": 41156, "aze": 41157, "torrential": 41158, "panzer": 41159, "gail": 41160, "underthe": 41161, "theological": 41162, "schizophre": 41163, "dazzle": 41164, "frederic": 41165, "mopar": 41166, "adilla": 41167, "soggy": 41168, "raun": 41169, "mediocre": 41170, "colorec": 41171, "ife": 41172, "pinst": 41173, "bluef": 41174, "²": 41175, "worldwater": 41176, "giroud": 41177, "clarinet": 41178, "adolf": 41179, "tarantino": 41180, "receipts": 41181, "assump": 41182, "ðŁijŁ": 41183, "coffees": 41184, "âľĬðŁı¾": 41185, "duplex": 41186, "sof": 41187, "rx": 41188, "lino": 41189, "timberwolves": 41190, "pandit": 41191, "motm": 41192, "ega": 41193, "ayama": 41194, "achs": 41195, "outsider": 41196, "llen": 41197, "coer": 41198, "tilly": 41199, "cheeseburger": 41200, "mads": 41201, "pledis": 41202, "empty": 41203, "nationalparks": 41204, "aziz": 41205, "pmi": 41206, "junkies": 41207, "fener": 41208, "sqn": 41209, "ès": 41210, "generation": 41211, "cleopatra": 41212, "bhubanes": 41213, "mosques": 41214, "tyfree": 41215, "poppins": 41216, "twc": 41217, "orwell": 41218, "nage": 41219, "kawhi": 41220, "hollow": 41221, "dalai": 41222, "¨¨¨¨": 41223, "ouro": 41224, "mhealth": 41225, "gion": 41226, "azo": 41227, "visas": 41228, "renegade": 41229, "reic": 41230, "wsop": 41231, "ðŁĴļðŁĴĽ": 41232, "echel": 41233, "toxicity": 41234, "mün": 41235, "bunk": 41236, "stimulating": 41237, "asthour": 41238, "\\'": 41239, "eph": 41240, "endemic": 41241, "cnbc": 41242, "shrinking": 41243, "peabody": 41244, "michelangelo": 41245, "canyon": 41246, "wale": 41247, "sumi": 41248, "siders": 41249, "inuit": 41250, "?.": 41251, "professionalism": 41252, "dracing": 41253, "platoon": 41254, "pons": 41255, "outbound": 41256, "mapleleafs": 41257, "desol": 41258, "cency": 41259, "athan": 41260, 
"verma": 41261, "rubbing": 41262, "okan": 41263, "ðŁijł": 41264, "mullins": 41265, "authentic": 41266, "Åį": 41267, "almanac": 41268, "gaia": 41269, "bbq": 41270, "onimo": 41271, "keh": 41272, "tya": 41273, "touts": 41274, "yav": 41275, "reposit": 41276, ",.": 41277, "wight": 41278, "seeyou": 41279, "callof": 41280, "donesia": 41281, "bargaining": 41282, "granth": 41283, "sdsu": 41284, "amphitheater": 41285, "psu": 41286, "rewatching": 41287, "winetasting": 41288, "peakdistrict": 41289, "detecting": 41290, "thurman": 41291, "phee": 41292, "èªķ": 41293, "umich": 41294, "rer": 41295, "sculpted": 41296, "gole": 41297, "namesake": 41298, "ðŁĶģ": 41299, "servicing": 41300, "baugh": 41301, "pugh": 41302, "pencil": 41303, "darth": 41304, "munchkin": 41305, "atorium": 41306, "teners": 41307, "suny": 41308, "rollingstones": 41309, "maging": 41310, "starrer": 41311, "idris": 41312, "feinstein": 41313, "agron": 41314, "âĺºï¸ıâĺºï¸ı": 41315, "supervised": 41316, "chameleon": 41317, "aggregate": 41318, "successive": 41319, "mogul": 41320, "instyle": 41321, "poldark": 41322, "custome": 41323, "ohiostate": 41324, "haya": 41325, "cides": 41326, "brokerage": 41327, "angelou": 41328, "fifawwc": 41329, "deforestation": 41330, "alton": 41331, "pamph": 41332, "hugged": 41333, "hobo": 41334, "changeable": 41335, "kuber": 41336, "burroughs": 41337, "demonetisation": 41338, "capecod": 41339, "versatility": 41340, "orice": 41341, "leila": 41342, "womeninscience": 41343, "tua": 41344, "hedges": 41345, "embarrassment": 41346, "alife": 41347, "soars": 41348, "nighter": 41349, "hymn": 41350, "gipp": 41351, "chasu": 41352, "techs": 41353, "niall": 41354, "killa": 41355, "hika": 41356, "camels": 41357, "value": 41358, "¢": 41359, "scoops": 41360, "mahmoud": 41361, "clusive": 41362, "adriana": 41363, "paco": 41364, "ozil": 41365, "unas": 41366, "translations": 41367, "whisperer": 41368, "sbi": 41369, "buxton": 41370, "biotics": 41371, "indiffe": 41372, "kenney": 41373, "klar": 41374, "etching": 41375, "barrabest": 41376, "instability": 41377, "seine": 41378, "votel": 41379, "blogged": 41380, "whiskey": 41381, "myspace": 41382, "tant": 41383, "landia": 41384, "giveback": 41385, "illus": 41386, "awak": 41387, "acab": 41388, "fbloggers": 41389, "cloudcomputing": 41390, "blatant": 41391, "syrians": 41392, "bandra": 41393, "styn": 41394, "anem": 41395, "keted": 41396, "karthik": 41397, "barunsob": 41398, "pinot": 41399, "gubernat": 41400, "gaye": 41401, "artiste": 41402, "ified": 41403, "conventions": 41404, "huan": 41405, "geniuses": 41406, "eeeeee": 41407, "folly": 41408, "somerville": 41409, "pridemonth": 41410, "ðŁĩºðŁĩ¸ðŁĩºðŁĩ¸": 41411, "chemotherapy": 41412, "pauls": 41413, "bakar": 41414, "ìĦ¸ë¸IJ": 41415, "taiwanese": 41416, "follo": 41417, "css": 41418, "reign": 41419, "nnnn": 41420, "flaun": 41421, "catastrophe": 41422, "ities": 41423, "fragments": 41424, "extremists": 41425, "ymoun": 41426, "carmen": 41427, "ezekiel": 41428, "connecting": 41429, "seh": 41430, "manta": 41431, "remodeling": 41432, "weymouth": 41433, "atoms": 41434, "cem": 41435, "newell": 41436, "lumi": 41437, "theopen": 41438, "moc": 41439, "miliband": 41440, "gland": 41441, "zshq": 41442, "maggie": 41443, "maniacs": 41444, "msp": 41445, "ady": 41446, "creams": 41447, "leanne": 41448, "esta": 41449, "pyg": 41450, "affinity": 41451, "prayer": 41452, "dunbar": 41453, "lightroom": 41454, "acadi": 41455, "wynonna": 41456, "romantic": 41457, "statedept": 41458, "sickle": 41459, "whos": 41460, "lamo": 41461, "etour": 41462, "finity": 41463, "shrub": 
41464, "sharpen": 41465, "pundit": 41466, "edon": 41467, "afore": 41468, "mars": 41469, "jeffery": 41470, "terps": 41471, "medallist": 41472, "katharine": 41473, "accusing": 41474, "taz": 41475, "royd": 41476, "fromhome": 41477, "confrontation": 41478, "allegh": 41479, "ðŁijīðŁijī": 41480, "refresher": 41481, "ranveer": 41482, "neverland": 41483, "jojo": 41484, "lucrative": 41485, "enam": 41486, "caver": 41487, "paedi": 41488, "manjaro": 41489, "fluids": 41490, "thessal": 41491, "oppressed": 41492, "muss": 41493, "johanna": 41494, "Ø®": 41495, "cng": 41496, "buildthe": 41497, "settles": 41498, "sith": 41499, "fuego": 41500, "clamp": 41501, "arag": 41502, "payer": 41503, "tedx": 41504, "mandy": 41505, "interstellar": 41506, "frc": 41507, "chand": 41508, "bcc": 41509, "molo": 41510, "lentil": 41511, "johansson": 41512, "grimsby": 41513, "naturelovers": 41514, "ðŁļ¨ðŁļ¨ðŁļ¨": 41515, "shinde": 41516, "xin": 41517, "internationaldayof": 41518, "transitional": 41519, "sata": 41520, "caddy": 41521, "wod": 41522, "ifu": 41523, "hays": 41524, "hollyo": 41525, "jang": 41526, "irc": 41527, "coim": 41528, "gradable": 41529, "\"\"": 41530, "ðŁį´": 41531, "া": 41532, "ael": 41533, "nyo": 41534, "westlake": 41535, "timeout": 41536, "sofi": 41537, "phenomena": 41538, "cultivation": 41539, "agno": 41540, "unarmed": 41541, "sot": 41542, "conj": 41543, "geno": 41544, "royalnavy": 41545, "nutrition": 41546, "fairmont": 41547, "tirelessly": 41548, "sng": 41549, "rety": 41550, "mica": 41551, "lucent": 41552, "sloane": 41553, "drool": 41554, "rizal": 41555, "odell": 41556, "criticized": 41557, ".'\"": 41558, "laze": 41559, "deserted": 41560, "coder": 41561, "pras": 41562, "lillian": 41563, "itinerary": 41564, "davy": 41565, "anap": 41566, "whipping": 41567, "hoboken": 41568, "kareena": 41569, "羣": 41570, "vius": 41571, "tern": 41572, "nantucket": 41573, "misunderstood": 41574, "bulaga": 41575, "stant": 41576, "chinook": 41577, "zam": 41578, "relies": 41579, "dss": 41580, "edmond": 41581, "sketchy": 41582, "mell": 41583, "fex": 41584, "rector": 41585, "distill": 41586, "daydream": 41587, "winemaker": 41588, "ripley": 41589, "billionaires": 41590, "helene": 41591, "atif": 41592, "culprit": 41593, "bertrand": 41594, "wouldnt": 41595, "mapped": 41596, "vak": 41597, "gladly": 41598, "parliament": 41599, "kidlitart": 41600, "wareness": 41601, "goliath": 41602, "âĨĵ": 41603, "viewpoint": 41604, "tatted": 41605, "fuls": 41606, "dorsey": 41607, "anglers": 41608, "lids": 41609, "kiya": 41610, "bowles": 41611, "beh": 41612, "bite": 41613, "compatibility": 41614, "ancestral": 41615, "prox": 41616, "behaved": 41617, "gubernatorial": 41618, "chfield": 41619, "saban": 41620, "zh": 41621, "teeny": 41622, "shibuya": 41623, "holliday": 41624, "pancy": 41625, "âĿĦï¸ıâĿĦï¸ı": 41626, "seungri": 41627, "?,": 41628, "ðŁĩ¦ðŁĩ·": 41629, "imitation": 41630, "impactful": 41631, "anyi": 41632, "genevie": 41633, "años": 41634, "bateman": 41635, "glider": 41636, "afar": 41637, "rasheed": 41638, "effortless": 41639, "shwar": 41640, "dachsh": 41641, "erun": 41642, "atos": 41643, "kini": 41644, "chd": 41645, "khaki": 41646, "klin": 41647, "felicidades": 41648, "belo": 41649, "asl": 41650, "toppers": 41651, "finley": 41652, "stacey": 41653, "rigorous": 41654, "karting": 41655, "leppard": 41656, "carmichael": 41657, "beret": 41658, "cse": 41659, "akhi": 41660, "meringue": 41661, "aban": 41662, "hake": 41663, "geri": 41664, "erjee": 41665, "resto": 41666, "commanders": 41667, "prit": 41668, "flor": 41669, "adven": 41670, "extermin": 41671, 
"remainder": 41672, "åIJ": 41673, "esg": 41674, "martino": 41675, "lullaby": 41676, "|@": 41677, "mign": 41678, "instore": 41679, "bigbang": 41680, "cordi": 41681, "cauley": 41682, "antebellum": 41683, "dgate": 41684, "crock": 41685, "spandex": 41686, "scaffolding": 41687, "oreos": 41688, "ê°ĵìĦ¸ë¸IJ": 41689, "pomona": 41690, "mauro": 41691, "universi": 41692, "remi": 41693, "afootball": 41694, "tant": 41695, "smalls": 41696, "neh": 41697, "worldo": 41698, "tropical": 41699, "morph": 41700, "javelin": 41701, "glar": 41702, "arquitec": 41703, "reminiscent": 41704, "tubs": 41705, "spidey": 41706, "makeu": 41707, "sylla": 41708, "progressives": 41709, "blot": 41710, "shorten": 41711, "keepin": 41712, "chak": 41713, "angst": 41714, "superfood": 41715, "decadent": 41716, "stony": 41717, "neurological": 41718, "arboretum": 41719, "annak": 41720, "fema": 41721, "percu": 41722, "disrespectful": 41723, "smallbiz": 41724, "lox": 41725, "coom": 41726, "csc": 41727, "bsbi": 41728, "prevalence": 41729, "himss": 41730, "espan": 41731, "moga": 41732, "frampton": 41733, "skymap": 41734, "masse": 41735, "leviathan": 41736, "().": 41737, "nocturnal": 41738, "carameli": 41739, "angor": 41740, "amnesia": 41741, "outsiders": 41742, "shealth": 41743, "rhino": 41744, "antag": 41745, "agio": 41746, "ðŁĴ°ðŁĴ°": 41747, "takeme": 41748, "kabaddi": 41749, "csi": 41750, "msh": 41751, "cochrane": 41752, "thessaloni": 41753, "sila": 41754, "haus": 41755, "dusting": 41756, "obese": 41757, "macklemore": 41758, "manish": 41759, "lenin": 41760, "mdc": 41761, "grown": 41762, "sheffield": 41763, "srs": 41764, "kele": 41765, "carson": 41766, "chum": 41767, "dahlia": 41768, "cantore": 41769, "oppo": 41770, "howling": 41771, "cybercrime": 41772, "surrealism": 41773, "scran": 41774, "faiz": 41775, "thren": 41776, "racists": 41777, "rout": 41778, "pknot": 41779, "semana": 41780, "sini": 41781, "mccull": 41782, "machi": 41783, "alfonso": 41784, "yb": 41785, "sardar": 41786, "kendrick": 41787, "deng": 41788, "recipro": 41789, "onf": 41790, "doomsday": 41791, "bribery": 41792, "customiz": 41793, "artis": 41794, "cpi": 41795, "ðŁĻĪðŁĻĪ": 41796, "slava": 41797, "lette": 41798, "ens": 41799, "âĿ¤ï¸ıðŁĺĺ": 41800, "crayon": 41801, "adan": 41802, "trc": 41803, "migrate": 41804, "simpson": 41805, "rowers": 41806, "kingsley": 41807, "farmersmarket": 41808, "sheehan": 41809, "nephe": 41810, "bornon": 41811, "carton": 41812, "mickey": 41813, "allure": 41814, "ulu": 41815, "slipknot": 41816, "hebdo": 41817, "guido": 41818, "dogcelebration": 41819, "onlinemarketing": 41820, "accelerating": 41821, ")..": 41822, "originated": 41823, "macaroni": 41824, "edtech": 41825, "outfield": 41826, "mitz": 41827, "discus": 41828, "advertiser": 41829, "manor": 41830, "hashi": 41831, "descrip": 41832, "capita": 41833, "fulbright": 41834, "receptor": 41835, "conn": 41836, "coney": 41837, "spionage": 41838, "rattle": 41839, "prest": 41840, "uli": 41841, "blogpost": 41842, "ackeray": 41843, ")âĢ¦": 41844, "redvelvet": 41845, "matth": 41846, "inspiring": 41847, "bsd": 41848, "kerri": 41849, "pocon": 41850, "millar": 41851, "repur": 41852, "accenture": 41853, "ä¹": 41854, "rambo": 41855, "ragnarok": 41856, "deleting": 41857, "britishmuseum": 41858, "patory": 41859, "leipzig": 41860, "florian": 41861, "scifi": 41862, "iners": 41863, "brate": 41864, "yoy": 41865, "melissa": 41866, "aber": 41867, "masa": 41868, "pote": 41869, "mosquitoes": 41870, "transplant": 41871, "rpa": 41872, ";))": 41873, "bastille": 41874, "ylan": 41875, "joyeux": 41876, "melodic": 41877, 
"captions": 41878, "atrist": 41879, "rochdale": 41880, "gotti": 41881, "pewdie": 41882, "cutiesaturday": 41883, "whois": 41884, "aquaculture": 41885, "tiva": 41886, "spel": 41887, "hess": 41888, "haji": 41889, "freddie": 41890, "coper": 41891, "brando": 41892, "vk": 41893, "photobook": 41894, "*,": 41895, "mydayin": 41896, "michaela": 41897, "brunei": 41898, "srini": 41899, "inte": 41900, "ı": 41901, "deol": 41902, "dfc": 41903, "separately": 41904, "bund": 41905, "vests": 41906, "toc": 41907, "meck": 41908, "reinforced": 41909, "constraints": 41910, "carroll": 41911, "sqft": 41912, "rever": 41913, "camper": 41914, "birdman": 41915, "inaction": 41916, "generators": 41917, "triumphant": 41918, "pests": 41919, "ovo": 41920, "gypt": 41921, "alamo": 41922, "scaled": 41923, "sureshpp": 41924, "sdn": 41925, "ismo": 41926, "gios": 41927, ")@": 41928, "justiceleague": 41929, "restaurant": 41930, "gabi": 41931, "dengue": 41932, "nextgen": 41933, "exempli": 41934, "apex": 41935, "inspirational": 41936, "downside": 41937, "kidz": 41938, "upl": 41939, "etna": 41940, "alvaro": 41941, "feldman": 41942, "barnet": 41943, "mha": 41944, "esch": 41945, "blooded": 41946, ">>>>>>>>": 41947, "kani": 41948, "hofficial": 41949, "casablanca": 41950, "birds": 41951, "tyga": 41952, "swamp": 41953, "oday": 41954, "newcastle": 41955, "nbap": 41956, "cision": 41957, "chools": 41958, "aflo": 41959, "nep": 41960, "monton": 41961, "akb": 41962, "supermodel": 41963, "downtime": 41964, "thos": 41965, "scwx": 41966, "snoopy": 41967, "aggreg": 41968, "yoke": 41969, "norcal": 41970, "wett": 41971, "prolonged": 41972, "metast": 41973, "beater": 41974, "fta": 41975, "tlap": 41976, "disgusted": 41977, "yh": 41978, "voiceover": 41979, "itchy": 41980, "ipc": 41981, "ðŁİ¾": 41982, "pheasant": 41983, "straits": 41984, "rampant": 41985, "jg": 41986, "fertil": 41987, "assures": 41988, "fortunes": 41989, "salinas": 41990, "lizards": 41991, "kettle": 41992, "ibs": 41993, "cynthi": 41994, "heg": 41995, "mccr": 41996, "socceroos": 41997, "happenings": 41998, "corden": 41999, "ðŁĺĤðŁijĮ": 42000, "tches": 42001, "egret": 42002, "wolverines": 42003, "congratulated": 42004, "hogg": 42005, "bottling": 42006, "wri": 42007, "ferri": 42008, "bosch": 42009, "afire": 42010, "ogden": 42011, "sjo": 42012, "jdm": 42013, "svt": 42014, "contex": 42015, "tollywood": 42016, "mink": 42017, "mese": 42018, "supersonic": 42019, "opoulos": 42020, "å¸": 42021, "âĶģ": 42022, "knuckle": 42023, "guise": 42024, "gami": 42025, "chucky": 42026, "zinger": 42027, "radial": 42028, "complained": 42029, "boda": 42030, "fetal": 42031, "disciplines": 42032, "corro": 42033, "ðŁĩ®ðŁĩ¹": 42034, "opted": 42035, "filtration": 42036, "adnan": 42037, "emcee": 42038, "mistre": 42039, "insomni": 42040, "fergus": 42041, "trajec": 42042, "ondon": 42043, "medtech": 42044, "tangerine": 42045, "madras": 42046, "grue": 42047, "cabs": 42048, "zhu": 42049, "sureshpprabhu": 42050, "insulated": 42051, "dayswild": 42052, "ppm": 42053, "bandai": 42054, "vday": 42055, "sff": 42056, "squid": 42057, "lothing": 42058, "notdead": 42059, "expressive": 42060, "cull": 42061, "alastair": 42062, "xu": 42063, "upfront": 42064, "fishers": 42065, "enes": 42066, "umd": 42067, "dismissal": 42068, "stier": 42069, "sels": 42070, "lust": 42071, "reactive": 42072, "protester": 42073, "eyelashes": 42074, "alim": 42075, "goode": 42076, "greeng": 42077, "dair": 42078, "compen": 42079, "anushka": 42080, "prototyping": 42081, "mapu": 42082, "bearings": 42083, "ðŁIJŁ": 42084, "forme": 42085, "bsbibotany": 42086, 
"timothy": 42087, "outskirts": 42088, "ambed": 42089, "aretha": 42090, "wendell": 42091, "streaks": 42092, "nim": 42093, "kpk": 42094, "snee": 42095, "fitter": 42096, "quota": 42097, "pate": 42098, "winning": 42099, "ðŁįŃ": 42100, "shopping": 42101, "mainst": 42102, "culver": 42103, "stevie": 42104, "mcfadden": 42105, "counterparts": 42106, "grenfell": 42107, "folsom": 42108, "dorset": 42109, "techcrunch": 42110, "â¬ħï¸ı": 42111, "tiptuesday": 42112, "usl": 42113, "trex": 42114, "georgie": 42115, "ranveerofficial": 42116, "licks": 42117, "sewn": 42118, "kf": 42119, "'âĢ¦": 42120, "japs": 42121, "pate": 42122, "orthop": 42123, "festa": 42124, "stras": 42125, "montal": 42126, "hammersmith": 42127, "foremost": 42128, "widows": 42129, "madre": 42130, "itez": 42131, "mitochondri": 42132, "ligans": 42133, "zona": 42134, "caribou": 42135, "mss": 42136, "andrei": 42137, "weatherchannel": 42138, "ghc": 42139, ":...": 42140, "taft": 42141, "aweather": 42142, "alisation": 42143, "brutal": 42144, "blissful": 42145, "nikola": 42146, "malicious": 42147, "qm": 42148, "mpgvip": 42149, "brodie": 42150, "blitz": 42151, "applaud": 42152, "dribb": 42153, "vague": 42154, "doggo": 42155, "translating": 42156, "interpreted": 42157, "hatched": 42158, "getyour": 42159, "beneficiaries": 42160, "sparring": 42161, "caesars": 42162, "awilliams": 42163, "lahat": 42164, "broke": 42165, "timp": 42166, "virtues": 42167, "relying": 42168, "pietro": 42169, "ktn": 42170, "icists": 42171, "pablo": 42172, "loui": 42173, "aag": 42174, "pnpp": 42175, "chast": 42176, "pulses": 42177, "finish": 42178, "usairforce": 42179, "typewriter": 42180, "thompson": 42181, "dogs": 42182, "utto": 42183, "ãģį": 42184, "sandal": 42185, "newly": 42186, "doge": 42187, "zw": 42188, "wankers": 42189, "negr": 42190, "mucha": 42191, "determines": 42192, "blackfish": 42193, "skunk": 42194, "mups": 42195, "instrument": 42196, "phyto": 42197, "daystogo": 42198, "skinned": 42199, "haider": 42200, "conten": 42201, "ðŁIJ¾ðŁIJ¾": 42202, "weiler": 42203, "undoubtedly": 42204, "chairing": 42205, "wallis": 42206, "shard": 42207, "zindabad": 42208, "adult": 42209, "absorption": 42210, "presto": 42211, "deploying": 42212, "drummond": 42213, "battlefront": 42214, "seagulls": 42215, "howdy": 42216, "judaism": 42217, "desde": 42218, "partition": 42219, "âľĿ": 42220, "nology": 42221, "nationalbestfriend": 42222, "lesnar": 42223, "filmfare": 42224, "coasts": 42225, "christensen": 42226, "acan": 42227, "mbu": 42228, "copped": 42229, "rubble": 42230, "swc": 42231, "funnier": 42232, "farther": 42233, "whereas": 42234, "nanotechnology": 42235, "withstand": 42236, "pillow": 42237, "bowers": 42238, "tope": 42239, "itly": 42240, "confit": 42241, "makar": 42242, "comforts": 42243, "bosh": 42244, "clipper": 42245, "balla": 42246, "stik": 42247, "milb": 42248, "safeguard": 42249, "musique": 42250, "easport": 42251, "yaz": 42252, "padded": 42253, "bader": 42254, "foreign": 42255, "chopin": 42256, "archive": 42257, "oka": 42258, "transporting": 42259, "tmltalk": 42260, "ajit": 42261, "consequence": 42262, "scroo": 42263, "ffo": 42264, "collaborated": 42265, "pugchat": 42266, "yemi": 42267, "javed": 42268, "auburn": 42269, "oof": 42270, "maw": 42271, "saucer": 42272, "mitigate": 42273, "iles": 42274, "evangelist": 42275, "terie": 42276, "recl": 42277, "indictment": 42278, "cata": 42279, "brightness": 42280, "maythe": 42281, "whimsical": 42282, "unlv": 42283, "keyword": 42284, "cumin": 42285, "medway": 42286, "westworld": 42287, "traw": 42288, "imposing": 42289, "formity": 42290, 
"coulter": 42291, "abz": 42292, "nypd": 42293, "grassi": 42294, "kelsey": 42295, "qldpol": 42296, "clockwork": 42297, "fdr": 42298, "dianne": 42299, "âĺij": 42300, "adh": 42301, "pann": 42302, "bravely": 42303, "aege": 42304, "unlawful": 42305, "verdi": 42306, "pocalypse": 42307, "pharo": 42308, "karla": 42309, "resonance": 42310, "mastiff": 42311, "ladak": 42312, "buu": 42313, "mailed": 42314, "hii": 42315, "crawley": 42316, "torrent": 42317, "machado": 42318, "libyan": 42319, "effortlessly": 42320, "falsely": 42321, "qvist": 42322, "keef": 42323, "crafthour": 42324, "cherished": 42325, "valkyrie": 42326, "sari": 42327, "kalamaz": 42328, "behe": 42329, "ðŁĮĻ": 42330, "thim": 42331, "roddy": 42332, "coltrane": 42333, "butchers": 42334, "achim": 42335, "wkend": 42336, "awkward": 42337, "cabrera": 42338, ":))))": 42339, "franc": 42340, "declan": 42341, "condos": 42342, "aja": 42343, "pandoramusic": 42344, "charter": 42345, "phill": 42346, "montrose": 42347, "hatchback": 42348, "handicapp": 42349, "greaves": 42350, "eucalyptus": 42351, "utmost": 42352, "tson": 42353, "burton": 42354, "midwives": 42355, "incur": 42356, "ðŁĺį#": 42357, "mood": 42358, "compressed": 42359, "toma": 42360, "mustang": 42361, "mog": 42362, "asana": 42363, "testic": 42364, "shotel": 42365, "insol": 42366, "corsair": 42367, "nhq": 42368, "benny": 42369, "smma": 42370, "kapur": 42371, "incon": 42372, "jonas": 42373, "energies": 42374, "donal": 42375, "asad": 42376, "sez": 42377, "npa": 42378, "archived": 42379, "stimulate": 42380, "dop": 42381, "hyd": 42382, "grieving": 42383, "ãĥĪ": 42384, "rona": 42385, "whyte": 42386, "treehouse": 42387, "ssell": 42388, "sandro": 42389, "kobo": 42390, "thermost": 42391, "seclu": 42392, "hiya": 42393, "geez": 42394, "mamas": 42395, "priscilla": 42396, "flavoured": 42397, "fass": 42398, "wold": 42399, "makerspace": 42400, "cosplay": 42401, "ptv": 42402, "happyvalentinesday": 42403, "sequoia": 42404, "lovecraft": 42405, "guan": 42406, "dtm": 42407, "cii": 42408, "yokohama": 42409, "posthum": 42410, "req": 42411, "ðŁĶµâļªï¸ı": 42412, "galatasar": 42413, "dolby": 42414, "hamptons": 42415, "disturbance": 42416, "stonehenge": 42417, "okc": 42418, "disrupting": 42419, "monthsary": 42420, "jungle": 42421, "headlights": 42422, "dustin": 42423, "microsof": 42424, "happymothersday": 42425, "koko": 42426, "grazi": 42427, "testo": 42428, "naidu": 42429, "malay": 42430, "arial": 42431, "rumb": 42432, "aboo": 42433, "harman": 42434, "trape": 42435, "spoils": 42436, "jeho": 42437, "godly": 42438, "lockscreen": 42439, "zun": 42440, "pious": 42441, "magento": 42442, "lenders": 42443, "probable": 42444, "corporal": 42445, "mour": 42446, "awal": 42447, "sua": 42448, "callme": 42449, "tonne": 42450, "govin": 42451, "devastation": 42452, "xj": 42453, "gearbox": 42454, "warlock": 42455, "perme": 42456, "itate": 42457, "gazaunderattack": 42458, "duval": 42459, "parasite": 42460, "clemente": 42461, "leth": 42462, "iva": 42463, "frozen": 42464, "tholes": 42465, "tobin": 42466, "cairn": 42467, "sill": 42468, "luckiest": 42469, "converts": 42470, "stale": 42471, "pancra": 42472, "europale": 42473, "wisdom": 42474, "schur": 42475, "ì¶": 42476, "vertigo": 42477, "bij": 42478, "ubc": 42479, "nure": 42480, "righteousness": 42481, "mtc": 42482, "factory": 42483, "verst": 42484, "reversed": 42485, "huri": 42486, "heechul": 42487, "faber": 42488, "arr": 42489, "ulous": 42490, "venom": 42491, "phat": 42492, "greenery": 42493, "brady": 42494, "æ": 42495, ":((": 42496, "nevergiveup": 42497, "disha": 42498, "mota": 42499, 
"healthcare": 42500, "dunham": 42501, "dexpo": 42502, "denzel": 42503, "bbins": 42504, "fics": 42505, "wham": 42506, "mcg": 42507, "elian": 42508, "wata": 42509, "stralia": 42510, "tellu": 42511, "pesky": 42512, "spinoff": 42513, "armoured": 42514, "reacted": 42515, "dofficial": 42516, "tedu": 42517, "sagar": 42518, "morally": 42519, "paralleled": 42520, "fios": 42521, "downer": 42522, "daugh": 42523, "redo": 42524, "worldcup": 42525, "tariq": 42526, "barne": 42527, "glaciers": 42528, "occult": 42529, "barbarian": 42530, "hermosa": 42531, "!!!)": 42532, "yur": 42533, "internation": 42534, "pss": 42535, "situ": 42536, "pint": 42537, "americanair": 42538, "swam": 42539, "doppler": 42540, "ðŁĴĻðŁĴľ": 42541, "cincodemayo": 42542, "levan": 42543, "hellenic": 42544, "mcne": 42545, "judi": 42546, "yuh": 42547, "stx": 42548, "quare": 42549, "ðŁĺĤ.": 42550, "stig": 42551, "gels": 42552, "motley": 42553, "hardwork": 42554, "eurozone": 42555, "ead": 42556, "ç¥Ń": 42557, "seabir": 42558, "cius": 42559, "laid": 42560, "alpaca": 42561, "presumably": 42562, "pewdiepie": 42563, "booted": 42564, "amari": 42565, "tamine": 42566, "solace": 42567, "barrow": 42568, "academies": 42569, "xian": 42570, "omination": 42571, "dungeons": 42572, "bma": 42573, "deity": 42574, "aik": 42575, "stabil": 42576, "hira": 42577, "affectionate": 42578, "vingne": 42579, "newport": 42580, "ãħĭãħĭ": 42581, "thirds": 42582, "retains": 42583, "aromatherapy": 42584, "skier": 42585, "nima": 42586, "dope": 42587, "cringe": 42588, "condomin": 42589, "toor": 42590, "animator": 42591, "saraj": 42592, "seascape": 42593, "minimalism": 42594, "lakeshore": 42595, "callaway": 42596, "bergman": 42597, "à¤Ĺ": 42598, "whispering": 42599, "stupid": 42600, "rightful": 42601, "requis": 42602, "irn": 42603, "seva": 42604, "utpol": 42605, "tuberculo": 42606, "squish": 42607, "debut": 42608, "governmental": 42609, "christine": 42610, "allman": 42611, "weapon": 42612, "sito": 42613, "buri": 42614, "lolita": 42615, "leafy": 42616, "fuch": 42617, "tinted": 42618, "mcken": 42619, "ahahaha": 42620, "ðŁĩµðŁĩ¹": 42621, "repeal": 42622, "negan": 42623, "ðŁķĬ": 42624, "tailgating": 42625, "gameinsight": 42626, "ðŁıŁï¸ı": 42627, "yakuza": 42628, "zt": 42629, "tiring": 42630, "proposing": 42631, "bowlers": 42632, "traitors": 42633, "akshi": 42634, "clergy": 42635, "cito": 42636, "upsets": 42637, "tuscal": 42638, "symphonic": 42639, "silently": 42640, "shuff": 42641, "blackwell": 42642, "ðŁĺĤ)": 42643, "kobe": 42644, "roberto": 42645, "ridg": 42646, "dcu": 42647, "merino": 42648, "ftp": 42649, "eastside": 42650, ".~": 42651, "nbl": 42652, "mnleg": 42653, "tsfor": 42654, "fraudul": 42655, "capping": 42656, "inmy": 42657, "gymnast": 42658, "stones": 42659, "ssin": 42660, "tweaks": 42661, "shaggy": 42662, "oakland": 42663, "demsin": 42664, "sangria": 42665, "mmva": 42666, "hennessy": 42667, "downton": 42668, "rightly": 42669, "init": 42670, "agave": 42671, "oblast": 42672, "northeast": 42673, "friendship": 42674, "dala": 42675, "trophy": 42676, "ðŁij½": 42677, "magin": 42678, "margaritas": 42679, "ê·": 42680, "wwfc": 42681, "fash": 42682, "dike": 42683, "cud": 42684, "chart": 42685, "ðŁij®": 42686, "refugees": 42687, "joplin": 42688, "ncs": 42689, "impy": 42690, "firmware": 42691, "pascu": 42692, "flamin": 42693, "healthtech": 42694, "bellletstalk": 42695, "waka": 42696, "olls": 42697, "lago": 42698, "cowan": 42699, "bombardier": 42700, "shome": 42701, "ðŁĻħ": 42702, "mcmaster": 42703, "nave": 42704, "wells": 42705, "uta": 42706, "tellers": 42707, "misfits": 42708, 
"kapil": 42709, "faceoff": 42710, "affirm": 42711, "apro": 42712, "whitepaper": 42713, "superyacht": 42714, "specimens": 42715, "allocated": 42716, "...,": 42717, "-__": 42718, "kaw": 42719, "dachshund": 42720, "djoker": 42721, "swork": 42722, "quiere": 42723, "orum": 42724, "ðŁIJł": 42725, "somm": 42726, "cmt": 42727, "inghour": 42728, "skinny": 42729, "lgbti": 42730, "giggles": 42731, "breakaway": 42732, "researched": 42733, "parity": 42734, "myal": 42735, "msl": 42736, "retained": 42737, "sivity": 42738, "makeinindia": 42739, "solves": 42740, "defamation": 42741, "waltham": 42742, "sriracha": 42743, "roadway": 42744, "conceptu": 42745, "alin": 42746, "iwant": 42747, "åĪ": 42748, "delft": 42749, "tenderloin": 42750, "gains": 42751, "faults": 42752, "swire": 42753, "stellen": 42754, "pollo": 42755, "dyne": 42756, "bornonthisday": 42757, "asdfghj": 42758, "sql": 42759, "salim": 42760, "advises": 42761, "voip": 42762, "ìĹijìĨ": 42763, "untouched": 42764, "sheil": 42765, "ontario": 42766, "uphill": 42767, "sobre": 42768, "deshi": 42769, "novella": 42770, "dutton": 42771, "crawfish": 42772, "اÙĨ": 42773, "maa": 42774, "twine": 42775, "kalin": 42776, "ðŁĩµðŁĩŃ": 42777, "yess": 42778, "brooks": 42779, "hoosiers": 42780, "tonka": 42781, "umbrellas": 42782, "ayers": 42783, "ateam": 42784, "acquiring": 42785, "suction": 42786, "än": 42787, "wies": 42788, "tarians": 42789, "socio": 42790, "mattb": 42791, "shepherds": 42792, "oso": 42793, "charitytuesday": 42794, "slogans": 42795, "ninjas": 42796, "albat": 42797, "byte": 42798, "bashir": 42799, "trampoline": 42800, "mydayinla": 42801, "ija": 42802, "basel": 42803, "rory": 42804, "goldie": 42805, "firec": 42806, "unnoticed": 42807, "peculiar": 42808, "scha": 42809, "kerson": 42810, "mourns": 42811, "liquidity": 42812, "quipment": 42813, "hibs": 42814, "ars": 42815, "aeronau": 42816, "slideshow": 42817, "slabs": 42818, "deliciousness": 42819, "skitchen": 42820, "htafc": 42821, "fullerton": 42822, "creighton": 42823, "aerob": 42824, "procrastination": 42825, "azores": 42826, "whitehall": 42827, "ussoccer": 42828, "mediation": 42829, "djokernole": 42830, "andme": 42831, "umen": 42832, "noxious": 42833, "joss": 42834, "ilife": 42835, "annivers": 42836, "sudanese": 42837, "etres": 42838, "undermine": 42839, "wholefoods": 42840, "disobe": 42841, "kori": 42842, "adele": 42843, "eliz": 42844, "canti": 42845, "alon": 42846, "gymnasium": 42847, "sarkodie": 42848, "meteorologist": 42849, "ylde": 42850, "steen": 42851, "stampcollecting": 42852, "nasal": 42853, "lott": 42854, "franks": 42855, "exol": 42856, "acki": 42857, "goodyear": 42858, "animalrights": 42859, "yles": 42860, "violets": 42861, "mmes": 42862, "sthel": 42863, "rapping": 42864, "tuscan": 42865, "waiver": 42866, "turner": 42867, "eatlocal": 42868, "northeasthour": 42869, "animations": 42870, "tommorow": 42871, "tsh": 42872, "ffame": 42873, "brae": 42874, "petron": 42875, "glamour": 42876, "bryn": 42877, "dcs": 42878, "bales": 42879, "ðŁĶ¶": 42880, "brov": 42881, "brev": 42882, "bons": 42883, "physique": 42884, "carne": 42885, "xe": 42886, "elixir": 42887, "volved": 42888, "loma": 42889, "ìľł": 42890, "æĺ": 42891, "vanu": 42892, "rigs": 42893, "balance": 42894, "vares": 42895, "bonita": 42896, "sprinkle": 42897, "perfecto": 42898, "dion": 42899, "leak": 42900, "calcutta": 42901, "oba": 42902, "dma": 42903, "cmon": 42904, "tuner": 42905, "pneumonia": 42906, "bogus": 42907, "apologe": 42908, "clough": 42909, "borne": 42910, "))))": 42911, "revived": 42912, "ovarian": 42913, "nerf": 42914, "clegg": 
42915, "fanfest": 42916, "chou": 42917, "realizes": 42918, "mcn": 42919, "ligu": 42920, "legalize": 42921, "justsaying": 42922, "forster": 42923, "bosni": 42924, "khi": 42925, "indom": 42926, "heidel": 42927, "encryp": 42928, "siss": 42929, "eddi": 42930, "marbles": 42931, "brisbane": 42932, "ying": 42933, "prepaid": 42934, "walsall": 42935, "cooperate": 42936, "orchestr": 42937, "marisa": 42938, "howie": 42939, "chewy": 42940, "brenner": 42941, "andromeda": 42942, "egan": 42943, "stocki": 42944, "cavendish": 42945, "agan": 42946, "bano": 42947, "deir": 42948, "gog": 42949, "blk": 42950, "rethinking": 42951, "chig": 42952, "rheu": 42953, "snip": 42954, "peng": 42955, "seminole": 42956, "mswx": 42957, "annex": 42958, "lynda": 42959, "lewishamilton": 42960, "cumul": 42961, "tbl": 42962, "dolphin": 42963, "aguero": 42964, "............": 42965, "prelude": 42966, "atour": 42967, "granger": 42968, "tooting": 42969, "rotun": 42970, "disar": 42971, "homeitems": 42972, "dares": 42973, "********": 42974, "ðŁijĨ": 42975, "compreh": 42976, "jinx": 42977, "aswell": 42978, "irie": 42979, "circulating": 42980, "ðŁIJ¥": 42981, "overboard": 42982, "cultivate": 42983, "rhett": 42984, "orienteering": 42985, "cak": 42986, "balkans": 42987, "sitt": 42988, "jasmin": 42989, "britneyspears": 42990, "rotor": 42991, "sealing": 42992, "gbc": 42993, "occi": 42994, "fas": 42995, "emancip": 42996, "comer": 42997, "wartime": 42998, "tickle": 42999, "sonny": 43000, "paces": 43001, "logg": 43002, "atrix": 43003, "srp": 43004, "gwin": 43005, "dobbs": 43006, "uzbe": 43007, "thewanted": 43008, "drush": 43009, "extru": 43010, "micky": 43011, "honorees": 43012, "darwin": 43013, "redux": 43014, "mmj": 43015, "rami": 43016, "jalapeño": 43017, "ioc": 43018, "dover": 43019, "juju": 43020, "whitney": 43021, "seng": 43022, "enly": 43023, "auch": 43024, "archipelago": 43025, "vigilant": 43026, "mangal": 43027, "wildest": 43028, "paranoid": 43029, "hali": 43030, "bbly": 43031, "sanctioned": 43032, "realms": 43033, "conco": 43034, "uddin": 43035, "csk": 43036, "playtime": 43037, "libra": 43038, "savag": 43039, "octane": 43040, "rectan": 43041, "return": 43042, "parrish": 43043, "morrha": 43044, "ccp": 43045, "cmu": 43046, "sailed": 43047, "sevent": 43048, "rosie": 43049, "piling": 43050, "hew": 43051, "boarded": 43052, "segments": 43053, "nephro": 43054, "(.": 43055, "crats": 43056, "bakes": 43057, "ðŁį¸": 43058, "backtothe": 43059, "sibling": 43060, "kirkland": 43061, "keo": 43062, "guwa": 43063, "breads": 43064, "ðŁĺľðŁĺľ": 43065, "tq": 43066, "harassed": 43067, "gau": 43068, "wilbur": 43069, "jisoo": 43070, "eper": 43071, "lisam": 43072, "trippin": 43073, "shino": 43074, "rukh": 43075, "beastmode": 43076, "choa": 43077, "instaweather": 43078, "richland": 43079, "gari": 43080, "fez": 43081, "cowboysnation": 43082, "fursuit": 43083, "krun": 43084, "aen": 43085, "sycamore": 43086, "segun": 43087, "entennial": 43088, "dih": 43089, "oax": 43090, "demsinphilly": 43091, "ðŁĻĢ": 43092, "snhl": 43093, "pennies": 43094, "passwords": 43095, "makin": 43096, "tye": 43097, "deng": 43098, "knigh": 43099, "jeeplife": 43100, "helpline": 43101, "afor": 43102, "zzzz": 43103, "steamy": 43104, "picker": 43105, "iterate": 43106, "happeningnow": 43107, "kib": 43108, "bloomberg": 43109, "martyrdom": 43110, "bully": 43111, "assortment": 43112, "ahora": 43113, "zoe": 43114, "noi": 43115, "illustri": 43116, "agarwal": 43117, "psc": 43118, "electronica": 43119, "recruiter": 43120, "gardiner": 43121, "radha": 43122, "nafta": 43123, "dotnet": 43124, "piero": 
43125, "georg": 43126, "bels": 43127, "ðŁĺĤðŁĺį": 43128, "tuberculosis": 43129, "runnin": 43130, "moris": 43131, "hauling": 43132, "evoc": 43133, "brethren": 43134, "shair": 43135, "frameworks": 43136, "astu": 43137, "rigid": 43138, "kuma": 43139, "kreme": 43140, "jinnah": 43141, "insurers": 43142, "nyu": 43143, "fere": 43144, "nollywood": 43145, "goodvibes": 43146, "-...": 43147, "toile": 43148, "skril": 43149, "instaweatherpro": 43150, "czech": 43151, "pavel": 43152, "onepiece": 43153, "nikeplus": 43154, "filet": 43155, "cavity": 43156, "ðŁı½âĢįâĻĤï¸ı": 43157, "ðŁİ£": 43158, "drastic": 43159, "dailys": 43160, "siamese": 43161, "rebu": 43162, "osteo": 43163, "lark": 43164, "fre": 43165, "shelling": 43166, "pé": 43167, "gladys": 43168, "ðŁıĢðŁıĢ": 43169, "gustave": 43170, "submerged": 43171, "grandstand": 43172, "attu": 43173, "wont": 43174, "fpv": 43175, "bley": 43176, "joni": 43177, "angames": 43178, "weighted": 43179, "alou": 43180, "श": 43181, "lesbians": 43182, "fj": 43183, "annies": 43184, "aml": 43185, "doria": 43186, "davin": 43187, "beta": 43188, "canc": 43189, "madewithunity": 43190, "haj": 43191, "badlands": 43192, "mul": 43193, "bluec": 43194, "pawn": 43195, "covington": 43196, "neurology": 43197, "httweets": 43198, "dyslexia": 43199, "thelove": 43200, "neat": 43201, "forklift": 43202, "automate": 43203, "uneven": 43204, "montess": 43205, "hein": 43206, "hag": 43207, "relics": 43208, "competitiveness": 43209, "canelo": 43210, "martens": 43211, "bulletproof": 43212, "skittles": 43213, "gya": 43214, "primo": 43215, "americafirst": 43216, "wooo": 43217, "abortions": 43218, "??!!": 43219, "mache": 43220, "lders": 43221, "rlly": 43222, "prelims": 43223, "direct": 43224, "course": 43225, "swain": 43226, "supercell": 43227, "eccentric": 43228, "stingray": 43229, "plets": 43230, "wilcox": 43231, "westin": 43232, "okanagan": 43233, "kiran": 43234, "carbo": 43235, "bombings": 43236, "rarest": 43237, "boh": 43238, "gawd": 43239, "digg": 43240, "moana": 43241, "entirety": 43242, "enclosed": 43243, "dodgeball": 43244, "parton": 43245, "milkyway": 43246, "atr": 43247, "thoroughbred": 43248, "really": 43249, "qantas": 43250, "epiphany": 43251, "inee": 43252, "aerosmith": 43253, "spieth": 43254, "arthro": 43255, "ellini": 43256, "dubu": 43257, "braving": 43258, "âļ½âļ½": 43259, "restructuring": 43260, "illuminate": 43261, "equili": 43262, "mpi": 43263, "ashton": 43264, "ponytail": 43265, "mascots": 43266, "flattering": 43267, "crum": 43268, "asta": 43269, "à®°": 43270, "strangerthings": 43271, "barnab": 43272, "رÙĬ": 43273, "makeshift": 43274, "gotcha": 43275, "willam": 43276, "choirs": 43277, "kilometres": 43278, "ghosh": 43279, "euthan": 43280, "dolly": 43281, "unning": 43282, "thear": 43283, "crewe": 43284, "wsw": 43285, "jace": 43286, "dismiss": 43287, "kean": 43288, "hota": 43289, "khat": 43290, "~>": 43291, "thiru": 43292, "rendez": 43293, "hartman": 43294, "teessi": 43295, "casca": 43296, "zah": 43297, "hydrange": 43298, "fod": 43299, "awp": 43300, "mzansi": 43301, "thicker": 43302, "nagoya": 43303, "neva": 43304, "stique": 43305, "castel": 43306, "damian": 43307, "thereby": 43308, "jiang": 43309, "alek": 43310, "musicislife": 43311, "raq": 43312, "callahan": 43313, "gouache": 43314, "somaliland": 43315, "seanhannity": 43316, "raheem": 43317, "lose": 43318, "elove": 43319, "wharton": 43320, "rectangular": 43321, "illustrating": 43322, "harne": 43323, "autisma": 43324, "scrapped": 43325, "elland": 43326, "decree": 43327, "nagpur": 43328, "kipp": 43329, "sore": 43330, "nmd": 43331, 
"maas": 43332, "guna": 43333, "gartner": 43334, "belli": 43335, "thenight": 43336, "jeon": 43337, "genderequality": 43338, "giver": 43339, "ael": 43340, "garments": 43341, "neu": 43342, "mardigras": 43343, "marsden": 43344, "rower": 43345, "polluted": 43346, "cameraman": 43347, "vinod": 43348, "beasley": 43349, "croc": 43350, "jiu": 43351, "hollyoaks": 43352, "anesthesia": 43353, "alles": 43354, "steward": 43355, "latimes": 43356, "ðŁĩºðŁĩ¸ðŁĩºðŁĩ¸ðŁĩºðŁĩ¸": 43357, "tician": 43358, "goria": 43359, "comedic": 43360, "ðŁ¤ĶðŁ¤ĶðŁ¤Ķ": 43361, "naive": 43362, "slions": 43363, "łĪ": 43364, "burglar": 43365, "ðŁĺŃðŁĺŃðŁĺŃðŁĺŃðŁĺŃ": 43366, "yorkshi": 43367, "señ": 43368, "fanboy": 43369, "laurel": 43370, "incidence": 43371, "potomac": 43372, "roberta": 43373, "presiden": 43374, "pryor": 43375, "osbourne": 43376, "wku": 43377, "teme": 43378, "palae": 43379, "ðŁ¥º": 43380, "reboun": 43381, "itude": 43382, "reddish": 43383, "khand": 43384, "colonialism": 43385, "northcarolina": 43386, "ðĿĴ": 43387, "mannequin": 43388, "ladybird": 43389, "tasty": 43390, "knowledgeable": 43391, "gshore": 43392, "ðŁĮĮ": 43393, "ன": 43394, "quaker": 43395, "salzburg": 43396, "medalists": 43397, "chyna": 43398, "bridesmaid": 43399, "maori": 43400, "rop": 43401, "outraged": 43402, "inadequate": 43403, "truckers": 43404, "alana": 43405, "ìĿ¼": 43406, "rix": 43407, "oooooooo": 43408, "commandments": 43409, "lambeth": 43410, "aaj": 43411, "ecofriendly": 43412, "blaz": 43413, "morecambe": 43414, "bouncy": 43415, "roux": 43416, "raided": 43417, "mized": 43418, "shc": 43419, "gawx": 43420, "laboratories": 43421, "rubs": 43422, "restroom": 43423, "consultations": 43424, "cajun": 43425, "virgini": 43426, "soir": 43427, "revue": 43428, "plein": 43429, "wager": 43430, "ç¹": 43431, "wedo": 43432, "growingup": 43433, "!ðŁĺĬ": 43434, "faceted": 43435, "sinners": 43436, "hovering": 43437, "tiene": 43438, "seasoning": 43439, "anja": 43440, "leggo": 43441, "ilis": 43442, "flax": 43443, "devo": 43444, "ashram": 43445, "matisse": 43446, "keri": 43447, "gower": 43448, "botox": 43449, "marshes": 43450, "unhcr": 43451, "tsm": 43452, "optimus": 43453, "duni": 43454, "stuffs": 43455, "sok": 43456, "orderly": 43457, "nbad": 43458, "islamophobia": 43459, "ravioli": 43460, "faber": 43461, "creds": 43462, "wonka": 43463, "infusion": 43464, "overweight": 43465, "dailynews": 43466, "assimil": 43467, "acollege": 43468, "medallion": 43469, "kilimanjaro": 43470, "stiff": 43471, "thames": 43472, "sunken": 43473, "thard": 43474, "mydubai": 43475, "hilariously": 43476, "hannel": 43477, "plumber": 43478, "fairview": 43479, "separating": 43480, "rascal": 43481, "quien": 43482, "necessities": 43483, "confederation": 43484, "llll": 43485, ":]": 43486, "weaknesses": 43487, "bronco": 43488, "raffles": 43489, "elot": 43490, "ãĤ¸ãĥ": 43491, "adventcalendar": 43492, "ðŁİ¹": 43493, "stravel": 43494, "tunic": 43495, "ksu": 43496, "impeach": 43497, "espionage": 43498, "!-": 43499, "diment": 43500, "currant": 43501, "biode": 43502, "commuting": 43503, "byron": 43504, "ðŁĴĵðŁĴĵ": 43505, "shaded": 43506, "truro": 43507, "crayons": 43508, "arne": 43509, "hsc": 43510, "freaked": 43511, "dramati": 43512, "fleek": 43513, "ucd": 43514, "marlborough": 43515, "^-": 43516, "crossings": 43517, "malo": 43518, "blackops": 43519, "binance": 43520, "choked": 43521, "cheney": 43522, "plo": 43523, "gestures": 43524, "valedic": 43525, "ryanair": 43526, "remington": 43527, "vcs": 43528, "mckee": 43529, "ecz": 43530, "begs": 43531, "nailart": 43532, "mayorof": 43533, "happyfathersday": 
43534, "wart": 43535, "petitions": 43536, "ningly": 43537, "cleanenergy": 43538, "brox": 43539, "slalom": 43540, "existent": 43541, "abay": 43542, "ugliest": 43543, "tomp": 43544, "stoma": 43545, "selby": 43546, "goalscorer": 43547, "benji": 43548, "overwhelmingly": 43549, "lans": 43550, "semiconductor": 43551, "southkorea": 43552, "rescheduled": 43553, "skyl": 43554, "enlisted": 43555, "dowski": 43556, "sidel": 43557, "rosenberg": 43558, "nasser": 43559, "whitehead": 43560, "prius": 43561, "harare": 43562, "enn": 43563, "ryder": 43564, "íĤ": 43565, "mong": 43566, "clasico": 43567, "transporter": 43568, "potty": 43569, "isme": 43570, "*****": 43571, "vice": 43572, "skit": 43573, "odessa": 43574, "lmp": 43575, "hern": 43576, "racially": 43577, "pinoy": 43578, "paraguay": 43579, "obituary": 43580, "goes": 43581, "bucha": 43582, "sidewalks": 43583, "angular": 43584, "unconstitutional": 43585, "transitioning": 43586, "ibu": 43587, "guys": 43588, "unpacking": 43589, "oooooo": 43590, "blackgirl": 43591, "bergs": 43592, "¯": 43593, "wordoftheday": 43594, "trumptrain": 43595, "thunderbolt": 43596, "msi": 43597, "fascists": 43598, "ब": 43599, "tsk": 43600, "collapses": 43601, "rajesh": 43602, "loveislove": 43603, "migrating": 43604, "setback": 43605, "ðŁĺĬâĿ¤ï¸ı": 43606, "tels": 43607, "safetyfirst": 43608, "narrated": 43609, "jaejoong": 43610, "unanswered": 43611, "liqueur": 43612, "ennes": 43613, "dalgo": 43614, "billings": 43615, "saltwater": 43616, "mermaids": 43617, "longs": 43618, "clapham": 43619, "wearec": 43620, "piccollage": 43621, "nach": 43622, "hace": 43623, "poisoned": 43624, "loth": 43625, "agna": 43626, "adelrey": 43627, "guardia": 43628, "polishing": 43629, "peacekeeping": 43630, "dall": 43631, "pisa": 43632, "lapland": 43633, "processors": 43634, "deandre": 43635, "sobs": 43636, "ponce": 43637, "drains": 43638, "cbe": 43639, "ðŁİ¥:": 43640, "splash": 43641, "meatball": 43642, "fontana": 43643, "worcestershirehour": 43644, "nev": 43645, "brisk": 43646, "bint": 43647, "acr": 43648, "pox": 43649, "cayenne": 43650, "skrillex": 43651, "jfc": 43652, "hahahahahahaha": 43653, "glas": 43654, "engul": 43655, "temporal": 43656, "onized": 43657, "concre": 43658, "compose": 43659, "vibrations": 43660, "planters": 43661, "fert": 43662, "criticalrolefanart": 43663, "tbli": 43664, "schallenge": 43665, "huckabee": 43666, "municipal": 43667, "iambic": 43668, "radios": 43669, "nevis": 43670, "durability": 43671, "mccla": 43672, "horseback": 43673, "institutes": 43674, "fulfill": 43675, "attach": 43676, "ateur": 43677, "akan": 43678, "resisting": 43679, "illumination": 43680, "handle": 43681, "haircare": 43682, "oment": 43683, "macleod": 43684, "kaiser": 43685, "gno": 43686, "beardown": 43687, "lyf": 43688, "glomer": 43689, "distortion": 43690, "zm": 43691, "sank": 43692, "roosters": 43693, "isnow": 43694, "asports": 43695, "agen": 43696, "woken": 43697, "stgeorge": 43698, "romper": 43699, "myle": 43700, "economists": 43701, "ruto": 43702, "twill": 43703, "healthand": 43704, "dito": 43705, "wsl": 43706, "tairp": 43707, "prakash": 43708, "micheal": 43709, "hts": 43710, "wrights": 43711, "katsu": 43712, "fiorentina": 43713, "defenseman": 43714, "ditch": 43715, "varsity": 43716, "texanscheer": 43717, "baham": 43718, "scanned": 43719, "weil": 43720, "seductive": 43721, "ðŁijįðŁı½": 43722, "fue": 43723, "erwin": 43724, "davison": 43725, "terran": 43726, "moods": 43727, "woolf": 43728, "resource": 43729, "@.": 43730, "cush": 43731, "ðŁį°": 43732, "regression": 43733, "curled": 43734, "lazer": 43735, 
"joanne": 43736, "abbott": 43737, "moz": 43738, "downers": 43739, "mmmmmm": 43740, "valentina": 43741, "khair": 43742, "dreamt": 43743, "crook": 43744, "chek": 43745, "steaming": 43746, "nephews": 43747, "cleric": 43748, "asober": 43749, "indefinitely": 43750, "wye": 43751, "usnews": 43752, "joyce": 43753, "flushing": 43754, "wynonnaearp": 43755, "rondo": 43756, "kiss": 43757, "hotdog": 43758, "barns": 43759, "saxophon": 43760, "farley": 43761, "gasp": 43762, "decreasing": 43763, "alway": 43764, "pex": 43765, "lsd": 43766, "shift": 43767, "poutine": 43768, "razz": 43769, "rescuing": 43770, "niko": 43771, "hoch": 43772, "ccl": 43773, "uaap": 43774, "nts": 43775, "mcar": 43776, "ilwx": 43777, "conquering": 43778, "kettering": 43779, "sturdy": 43780, "delaying": 43781, "stok": 43782, "vanished": 43783, "cathar": 43784, "bingham": 43785, "inv": 43786, "ichiro": 43787, "hemo": 43788, "budgeting": 43789, "[...]": 43790, "bess": 43791, "sebastian": 43792, "slowed": 43793, "ðĿij": 43794, "muslim": 43795, "stuns": 43796, "actonclimate": 43797, "vea": 43798, "seton": 43799, "rosetta": 43800, "ount": 43801, "hardin": 43802, "fluid": 43803, "caw": 43804, "ðŁ¥Ĥ": 43805, "yacht": 43806, "unl": 43807, "sphy": 43808, "provocative": 43809, "oric": 43810, "isback": 43811, "___": 43812, "nicolas": 43813, "gyan": 43814, "loose": 43815, "flin": 43816, "rebate": 43817, ":::": 43818, "!\"@": 43819, "comicon": 43820, "sheff": 43821, "downstream": 43822, "chichester": 43823, "beachlife": 43824, "momlife": 43825, "diabete": 43826, "arra": 43827, "vane": 43828, "oku": 43829, "yeo": 43830, "mango": 43831, "tryout": 43832, "appell": 43833, "heirs": 43834, "arjuna": 43835, "ddu": 43836, "naveen": 43837, "movic": 43838, "socialists": 43839, "sback": 43840, "criterion": 43841, "soyuz": 43842, "kher": 43843, "daz": 43844, "yolanda": 43845, "wineoclock": 43846, "reina": 43847, "onew": 43848, "leonard": 43849, "endez": 43850, "ubs": 43851, "supportlocal": 43852, "facilitated": 43853, "caramelized": 43854, "bpa": 43855, "vuelta": 43856, "mytho": 43857, "mami": 43858, "speare": 43859, "nbaplayoffs": 43860, "fevre": 43861, "nickjonas": 43862, "imprint": 43863, "cso": 43864, "craigslist": 43865, "lasalle": 43866, "gideon": 43867, "hadoop": 43868, "disregard": 43869, "wud": 43870, "tuc": 43871, "magee": 43872, "acoustics": 43873, "taa": 43874, "quie": 43875, "pola": 43876, "crt": 43877, "dwyer": 43878, "dissec": 43879, "capitol": 43880, "mention": 43881, "knoll": 43882, "heigh": 43883, "finders": 43884, "placements": 43885, "lse": 43886, "indira": 43887, "guri": 43888, "madhuridixit": 43889, "kingdoms": 43890, "iambicpent": 43891, "georgina": 43892, "jeky": 43893, "conflicting": 43894, "bayan": 43895, "agatha": 43896, "uphold": 43897, "dron": 43898, "vicar": 43899, "expat": 43900, "peripheral": 43901, "pessi": 43902, "faf": 43903, "ancestor": 43904, "?..": 43905, "widget": 43906, "punc": 43907, "commenced": 43908, "beavs": 43909, "airwaves": 43910, "addis": 43911, "poa": 43912, "desses": 43913, "coden": 43914, "vue": 43915, "rupee": 43916, "karin": 43917, "spock": 43918, "msy": 43919, "ะ": 43920, "prick": 43921, "fillmore": 43922, "tification": 43923, "thingsto": 43924, "sarde": 43925, "emile": 43926, "pereira": 43927, "nad": 43928, "brightening": 43929, "arresting": 43930, "woking": 43931, "uscg": 43932, "spill": 43933, "raspberrypi": 43934, "hugo": 43935, "itec": 43936, "isma": 43937, "cufflinks": 43938, "optimized": 43939, "occ": 43940, "miwx": 43941, "enka": 43942, "elited": 43943, "affordable": 43944, "sakh": 43945, 
"coronado": 43946, "hoh": 43947, "atul": 43948, "aioli": 43949, "jimcantore": 43950, "accounted": 43951, "vinay": 43952, "hermit": 43953, "grooves": 43954, "ranch": 43955, "rilla": 43956, "wetter": 43957, "outof": 43958, "veterin": 43959, "nikov": 43960, "kian": 43961, "fairbanks": 43962, "ramapho": 43963, "niti": 43964, "kko": 43965, "rusty": 43966, "nestle": 43967, "tvxq": 43968, "shaheer": 43969, "âĿ¤âĿ¤âĿ¤âĿ¤": 43970, "pennant": 43971, "gemstones": 43972, "demdebate": 43973, "ðŁIJĬ": 43974, "autonews": 43975, "supportindiefilm": 43976, "macho": 43977, "vex": 43978, "newsat": 43979, "neti": 43980, "concessions": 43981, "candied": 43982, "yofthe": 43983, "macau": 43984, "dends": 43985, "cricketers": 43986, "saniti": 43987, "mariano": 43988, "ghat": 43989, "artoftheday": 43990, "¡ľ": 43991, "egos": 43992, "genoa": 43993, "chatbots": 43994, "brier": 43995, "allabout": 43996, "monty": 43997, "spied": 43998, "rtr": 43999, "comfort": 44000, "snippets": 44001, "realtime": 44002, "grain": 44003, "examined": 44004, "enlightening": 44005, "ttu": 44006, "godbless": 44007, "releasethe": 44008, "singular": 44009, "kians": 44010, "haka": 44011, "sorren": 44012, "defect": 44013, "marg": 44014, "equities": 44015, "dorian": 44016, "suka": 44017, "perl": 44018, "aishwarya": 44019, "pullover": 44020, "precision": 44021, "fairway": 44022, "neve": 44023, "riveting": 44024, "villanova": 44025, "encom": 44026, "ako": 44027, "passionately": 44028, "europaleague": 44029, "siempre": 44030, "xvi": 44031, "enlightened": 44032, "cfr": 44033, "âĺħâĺħâĺħâĺħ": 44034, "wasteland": 44035, "isf": 44036, "newcomers": 44037, "emergency": 44038, "amphitheatre": 44039, "-.": 44040, "textbooks": 44041, "figurative": 44042, "tremb": 44043, "pesc": 44044, "abhin": 44045, "abbot": 44046, "acacia": 44047, "hards": 44048, "porsche": 44049, "kauai": 44050, "elisa": 44051, "carrick": 44052, "abou": 44053, "ellier": 44054, "bech": 44055, "neutron": 44056, "galapagos": 44057, "ruben": 44058, "innis": 44059, "howto": 44060, "nuns": 44061, "sabine": 44062, "iac": 44063, "clinched": 44064, "notori": 44065, "fives": 44066, "cairngor": 44067, "peri": 44068, "grc": 44069, "ðŁĴ¯ðŁĴ¯": 44070, "malm": 44071, "twelfth": 44072, "diff": 44073, "routines": 44074, "martyn": 44075, "linden": 44076, "synthesizer": 44077, "number": 44078, "gamecube": 44079, "falkirk": 44080, "byzantine": 44081, "queuing": 44082, "grill": 44083, "scalable": 44084, "charred": 44085, "routing": 44086, "herbali": 44087, "grizz": 44088, "ðŁĺŃðŁĺŃðŁĺŃ": 44089, "toll": 44090, "terminals": 44091, "lpc": 44092, "abd": 44093, "warmups": 44094, "removable": 44095, "¯\\": 44096, "vigo": 44097, "papaya": 44098, "neve": 44099, "lovingly": 44100, "jokers": 44101, "ibles": 44102, "ssett": 44103, "potenti": 44104, "pele": 44105, "gigi": 44106, "sadiq": 44107, "legacy": 44108, "sono": 44109, "rupees": 44110, "retarded": 44111, "elee": 44112, "parr": 44113, "fiance": 44114, "eyre": 44115, "sayers": 44116, "pendants": 44117, "maknae": 44118, "albans": 44119, "adapting": 44120, "pff": 44121, "puberty": 44122, "jiu": 44123, "ingrad": 44124, "hypocrite": 44125, "diplomats": 44126, "physical": 44127, "robby": 44128, "bonsai": 44129, "ãģ·": 44130, "fatt": 44131, "catalunya": 44132, "âľĸï¸ı": 44133, "roma": 44134, "moreland": 44135, "soe": 44136, "conversions": 44137, "stlblues": 44138, "sholm": 44139, "grassy": 44140, "prado": 44141, "onu": 44142, "assaulting": 44143, ">_": 44144, "settes": 44145, "disgraceful": 44146, "aphra": 44147, "âļ½ï¸ıâļ½ï¸ı": 44148, "प": 44149, "kiln": 44150, 
"goaltender": 44151, "sru": 44152, "philanthropist": 44153, "bals": 44154, "thn": 44155, "studen": 44156, "sandoval": 44157, "dogrescue": 44158, "elions": 44159, "assessed": 44160, "largo": 44161, "hectares": 44162, "shrm": 44163, "saif": 44164, "cleavage": 44165, "noches": 44166, "nene": 44167, "fatalities": 44168, "curing": 44169, "cleanser": 44170, "ales": 44171, "pvp": 44172, "southbank": 44173, "pizzeria": 44174, "marshals": 44175, "knife": 44176, "andover": 44177, "tblightning": 44178, "srsly": 44179, "oute": 44180, "digimon": 44181, "timesofindia": 44182, "promethe": 44183, "lebo": 44184, "fsu": 44185, "witz": 44186, "revere": 44187, "manas": 44188, "mamba": 44189, "chica": 44190, "guan": 44191, "exhibitor": 44192, "csrracing": 44193, "dere": 44194, "xxxxx": 44195, "gusta": 44196, "storytime": 44197, "stoney": 44198, "organics": 44199, "andu": 44200, "seam": 44201, "minogue": 44202, "anushkasharma": 44203, "aba": 44204, "ðŁİĻï¸ı": 44205, "ugandan": 44206, "chromatic": 44207, "assn": 44208, "documentaries": 44209, "sht": 44210, "rupaul": 44211, "loyd": 44212, "kats": 44213, "eus": 44214, "itech": 44215, "medusa": 44216, "panty": 44217, "kellogg": 44218, "etto": 44219, "tallade": 44220, "shaa": 44221, "dost": 44222, "pms": 44223, "mariana": 44224, "jester": 44225, "crooks": 44226, "ðŁĶ¬": 44227, "mindanao": 44228, "indhoven": 44229, "ðŁ¤ª": 44230, "lexi": 44231, "tvn": 44232, "janis": 44233, "cote": 44234, "ãģĨ": 44235, "serrano": 44236, "iwm": 44237, "ðŁIJ¬": 44238, "kke": 44239, "distributors": 44240, "capu": 44241, "counterfeit": 44242, "campsite": 44243, "aggie": 44244, "ðŁĺ¼": 44245, "chhattisgarh": 44246, "~@": 44247, "stateu": 44248, "sandi": 44249, "preventable": 44250, "cls": 44251, "canne": 44252, "mmc": 44253, "iver": 44254, "saharan": 44255, "palis": 44256, "nightout": 44257, "dos": 44258, "apia": 44259, "abscbn": 44260, "managerial": 44261, "arose": 44262, "mowx": 44263, "arosa": 44264, "ðŁĮ³": 44265, "underdog": 44266, "remover": 44267, "astronomers": 44268, "lentils": 44269, "suscep": 44270, "smoother": 44271, "pendleton": 44272, "faucet": 44273, "emory": 44274, "dalmati": 44275, "afcb": 44276, "ticus": 44277, "exempt": 44278, "enrol": 44279, "dheim": 44280, "ðŁIJº": 44281, "restriction": 44282, "starfish": 44283, "stow": 44284, "snorkel": 44285, "thunderbirds": 44286, "shead": 44287, "homosexual": 44288, "dyn": 44289, "asli": 44290, "andretti": 44291, "douche": 44292, "domo": 44293, "tarmac": 44294, "slumber": 44295, "pronto": 44296, "firstdayof": 44297, "miniature": 44298, "mariachi": 44299, "argus": 44300, "recommending": 44301, "mobiles": 44302, "ince": 44303, "illustrious": 44304, "orc": 44305, "adverts": 44306, "grits": 44307, "weasel": 44308, "pagoda": 44309, "overpass": 44310, "greys": 44311, "maximus": 44312, "armagh": 44313, "woodland": 44314, "sunni": 44315, "ðŁĴī": 44316, "ëĿ": 44317, "tione": 44318, "socio": 44319, "hos": 44320, "ðŁ¤ĹðŁ¤Ĺ": 44321, "windsor": 44322, "subsequent": 44323, "munchies": 44324, "idh": 44325, "excluding": 44326, "emi": 44327, "cuth": 44328, "zai": 44329, "weekdays": 44330, "lawsuits": 44331, "barnard": 44332, "ت": 44333, "petting": 44334, "netes": 44335, "mulligan": 44336, "pharmacists": 44337, "raquel": 44338, "eton": 44339, "cranston": 44340, "gilded": 44341, "cleary": 44342, "ceph": 44343, "raa": 44344, "pamper": 44345, "lombardi": 44346, "asin": 44347, "sherry": 44348, "prod": 44349, "forte": 44350, "arianism": 44351, "buffalobills": 44352, "æľ¬": 44353, "ðŁĶ¥#": 44354, "uuu": 44355, "justices": 44356, "carina": 44357, "natin": 
44358, "maslow": 44359, "drooling": 44360, "cognac": 44361, "camber": 44362, "elong": 44363, "rdr": 44364, "inen": 44365, "convictions": 44366, "amuse": 44367, "trock": 44368, "harmless": 44369, "visitation": 44370, "genomic": 44371, "bland": 44372, "benoit": 44373, "chimp": 44374, "tuscaloosa": 44375, "greasy": 44376, "xpo": 44377, "gilt": 44378, "seq": 44379, "permitted": 44380, "christmaseve": 44381, "books": 44382, "mue": 44383, "oldschool": 44384, "humanright": 44385, "beati": 44386, "ðŁĶĿ": 44387, "shat": 44388, "sculpting": 44389, "hwan": 44390, "fernandes": 44391, "sciutto": 44392, "fuentes": 44393, "endeavors": 44394, "maidstone": 44395, "unparalleled": 44396, "shouted": 44397, "queenof": 44398, "merc": 44399, "bandic": 44400, "veda": 44401, "selangor": 44402, "pile": 44403, "jahan": 44404, "intimidating": 44405, "disappears": 44406, "clich": 44407, "zaha": 44408, "wurst": 44409, "hiv": 44410, "fodils": 44411, "cordless": 44412, "aaaaaa": 44413, "hydra": 44414, "belinda": 44415, "eels": 44416, "buf": 44417, "sustaining": 44418, "rugbyleague": 44419, "noc": 44420, "brigitte": 44421, "(ðŁĵ¸:": 44422, "trombone": 44423, "soothe": 44424, "smog": 44425, "adp": 44426, "stable": 44427, "ingley": 44428, "diagnose": 44429, "msg": 44430, "wess": 44431, "ticketing": 44432, "onee": 44433, "nswpol": 44434, "eup": 44435, "autopsy": 44436, "adityanath": 44437, "sundown": 44438, "riverfront": 44439, "siya": 44440, "pis": 44441, "hierarchy": 44442, "durango": 44443, "dijk": 44444, "renshaw": 44445, "heaps": 44446, "epidemi": 44447, "davidbowie": 44448, "internetof": 44449, "ddi": 44450, "nationality": 44451, "mbar": 44452, "airy": 44453, "winder": 44454, "walia": 44455, "elliott": 44456, "cx": 44457, "bavarian": 44458, "platt": 44459, "antw": 44460, "wiwx": 44461, "softer": 44462, "neha": 44463, "heller": 44464, "thand": 44465, "daniela": 44466, "boast": 44467, "degradation": 44468, "ðŁĴ¦ðŁĴ¦": 44469, "transforming": 44470, "mane": 44471, "avut": 44472, "ðŁĺĪðŁĺĪ": 44473, "voter": 44474, "thee": 44475, "tate": 44476, "puff": 44477, "indoor": 44478, "soproud": 44479, "boyce": 44480, "borisjohnson": 44481, "waitin": 44482, "immunology": 44483, "ðŁıĨðŁıĨðŁıĨ": 44484, "âĿĮ": 44485, "streetfood": 44486, "lizasober": 44487, "cavalier": 44488, "celia": 44489, "needle": 44490, "motoring": 44491, "gato": 44492, ",)": 44493, "rade": 44494, "harvest": 44495, "tms": 44496, "jarpad": 44497, "oney": 44498, "airmen": 44499, "vre": 44500, "impairment": 44501, "abhishek": 44502, "snoop": 44503, "lant": 44504, "famously": 44505, "blou": 44506, "sze": 44507, "gander": 44508, "untouch": 44509, "tuf": 44510, "deejay": 44511, "collateral": 44512, "bind": 44513, "ðŁļ©": 44514, "pinning": 44515, "icn": 44516, "';": 44517, "theeconomist": 44518, "ultram": 44519, "worldwaterday": 44520, "tipoff": 44521, "thei": 44522, "feeders": 44523, "campaign": 44524, "scumb": 44525, "dayweekend": 44526, "yom": 44527, "pedic": 44528, "hough": 44529, "psv": 44530, "plin": 44531, "onde": 44532, "bostonmarathon": 44533, "azzy": 44534, "*_*": 44535, "conley": 44536, "thiago": 44537, "hooo": 44538, "galerie": 44539, "lucid": 44540, "jett": 44541, "glitz": 44542, "finalfantasy": 44543, "achievers": 44544, "yung": 44545, "peregrine": 44546, "ophi": 44547, "dames": 44548, "biomar": 44549, "âĺĢï¸ıâĺĢï¸ı": 44550, "skc": 44551, "lics": 44552, "flank": 44553, "arrahman": 44554, "hoof": 44555, "upholstery": 44556, "tats": 44557, "woz": 44558, "¿": 44559, "snoring": 44560, "raer": 44561, "lju": 44562, "apd": 44563, "plating": 44564, "kanu": 44565, 
"imation": 44566, "fragrances": 44567, "mra": 44568, "moray": 44569, "mott": 44570, "immuni": 44571, "hearties": 44572, "bhopal": 44573, "timers": 44574, "gata": 44575, "colorway": 44576, "carnation": 44577, "winget": 44578, "sighs": 44579, "sville": 44580, "optimist": 44581, "chateau": 44582, "olympians": 44583, "cio": 44584, "singersongwriter": 44585, "nyo": 44586, "fibers": 44587, "burch": 44588, "agro": 44589, "milne": 44590, "igbo": 44591, "cramer": 44592, "ationals": 44593, "danube": 44594, "padma": 44595, "normani": 44596, "enforced": 44597, "breck": 44598, "boehner": 44599, "arden": 44600, "surrendered": 44601, "prosthetic": 44602, "oma": 44603, "hailed": 44604, "calculations": 44605, "wfa": 44606, "bib": 44607, "fcblive": 44608, "fonda": 44609, "westcoast": 44610, "quests": 44611, "friendly": 44612, "towie": 44613, "fitch": 44614, "balot": 44615, "stardom": 44616, "scratching": 44617, "hosa": 44618, "thika": 44619, "oven": 44620, "stroke": 44621, "outpost": 44622, "pharmaceuticals": 44623, "hikari": 44624, "muy": 44625, "afd": 44626, "fallontonight": 44627, "squat": 44628, "oru": 44629, "drained": 44630, "chocolat": 44631, "민": 44632, "worths": 44633, "rib": 44634, "muj": 44635, "thats": 44636, "residente": 44637, "itel": 44638, "boost": 44639, "migos": 44640, "mulled": 44641, "laa": 44642, "etsyshop": 44643, "donkeys": 44644, "mek": 44645, "ptc": 44646, "flinders": 44647, "ehs": 44648, "rohit": 44649, "muir": 44650, "gad": 44651, "compositions": 44652, "åĨĻ": 44653, "combustion": 44654, "ikh": 44655, "yemeni": 44656, "waved": 44657, "garci": 44658, "akos": 44659, "oods": 44660, "fusion": 44661, "seque": 44662, "slan": 44663, "plur": 44664, "kicchasu": 44665, "shenando": 44666, "sams": 44667, "worlden": 44668, "horowitz": 44669, "withme": 44670, "microbes": 44671, "kki": 44672, "ðŁĴĶðŁĴĶ": 44673, "wsu": 44674, "patchwork": 44675, "freer": 44676, "yaki": 44677, "theart": 44678, "symbolism": 44679, "miler": 44680, "btn": 44681, "mabu": 44682, "sidekick": 44683, "motivates": 44684, "sagitt": 44685, "naturals": 44686, "serviced": 44687, "psori": 44688, "paola": 44689, "quig": 44690, "ibadan": 44691, "giggs": 44692, "ë³": 44693, "scientology": 44694, "sioux": 44695, "salamat": 44696, "dres": 44697, "cadbury": 44698, "dhawan": 44699, "ción": 44700, "_'": 44701, "swapping": 44702, "mariska": 44703, "jamesbond": 44704, "explosives": 44705, "ayles": 44706, "afer": 44707, "sagu": 44708, "censor": 44709, "toma": 44710, "jefferson": 44711, "ringed": 44712, "partist": 44713, "irresponsible": 44714, "aguilar": 44715, "vacay": 44716, "equitable": 44717, "altrincham": 44718, "acur": 44719, "manish": 44720, "germin": 44721, "schooled": 44722, "putter": 44723, "edad": 44724, "naval": 44725, "toasty": 44726, "solareclipse": 44727, "dishu": 44728, "coyne": 44729, "acco": 44730, "muck": 44731, "maran": 44732, "elos": 44733, "lender": 44734, "croix": 44735, "worthless": 44736, "haber": 44737, "gunmen": 44738, "ðŁįĵ": 44739, "zenith": 44740, "tenders": 44741, "hurst": 44742, "holtz": 44743, "italians": 44744, "carlow": 44745, "ucd": 44746, "characteristic": 44747, "bung": 44748, "avl": 44749, "uth": 44750, "sasia": 44751, "rsl": 44752, "redman": 44753, "neighboring": 44754, "greenpeace": 44755, "stips": 44756, "followparty": 44757, "ygk": 44758, "enos": 44759, "omnibus": 44760, "naissance": 44761, "chrissy": 44762, "secure": 44763, "callback": 44764, "jihoon": 44765, "memory": 44766, "blocker": 44767, "lanta": 44768, "daffodils": 44769, "bilt": 44770, "fferty": 44771, "faust": 44772, "iec": 44773, 
"nipples": 44774, "sog": 44775, "mnd": 44776, "jaguar": 44777, "boldly": 44778, "abpoli": 44779, "proposition": 44780, "gunsense": 44781, "evansville": 44782, "cutters": 44783, "wego": 44784, "doun": 44785, "dox": 44786, "stallions": 44787, "kaj": 44788, "shippers": 44789, "jawa": 44790, "volo": 44791, "leven": 44792, "paprika": 44793, "kovich": 44794, "jordi": 44795, "inductees": 44796, "appalling": 44797, "dialysis": 44798, "alleviate": 44799, "âĢĶâĢĶ": 44800, "pieter": 44801, "midwi": 44802, "qtr": 44803, "juliette": 44804, "intermission": 44805, "hawks": 44806, "actment": 44807, "oneill": 44808, "klin": 44809, "vamps": 44810, "famous": 44811, "could": 44812, "automobi": 44813, "daan": 44814, "westend": 44815, "ellip": 44816, "nhc": 44817, "melanch": 44818, "webseries": 44819, "tongue": 44820, "snatched": 44821, "smyth": 44822, "tangible": 44823, "sli": 44824, "easing": 44825, "barstool": 44826, "overlay": 44827, "affordability": 44828, "tinged": 44829, "teras": 44830, "ayush": 44831, "wannaone": 44832, "rhine": 44833, "dana": 44834, "shana": 44835, "kendal": 44836, "fertile": 44837, "wir": 44838, "repleni": 44839, "larvae": 44840, "isro": 44841, "convos": 44842, "abbrevi": 44843, "ucc": 44844, "hungry": 44845, "burrows": 44846, "ager": 44847, "navi": 44848, "matin": 44849, "duper": 44850, "cern": 44851, "madon": 44852, "ķï¸ı": 44853, "éģ": 44854, "tups": 44855, "hyatt": 44856, "shep": 44857, "fridaynight": 44858, "wiser": 44859, "heidi": 44860, "hatton": 44861, "pgh": 44862, "fountain": 44863, "wristbands": 44864, "ahmadiyya": 44865, "aerial": 44866, "subscribed": 44867, "solos": 44868, "mace": 44869, "slayed": 44870, "forfe": 44871, "dulce": 44872, "christmass": 44873, "arunjaitley": 44874, "violate": 44875, "obstru": 44876, "nieces": 44877, "wvu": 44878, "idyl": 44879, "faze": 44880, "preserves": 44881, "infringe": 44882, "premiers": 44883, "intervals": 44884, "agency": 44885, "(©": 44886, "standalone": 44887, "dimes": 44888, "boer": 44889, "parameters": 44890, "getit": 44891, "ðŁĺĺðŁĺĺðŁĺĺðŁĺĺ": 44892, "tulane": 44893, "forgiven": 44894, "scoll": 44895, "mbps": 44896, "smashbros": 44897, "robbi": 44898, "primavera": 44899, "alist": 44900, "ghostly": 44901, "ayat": 44902, "yeats": 44903, "impressionist": 44904, "earphones": 44905, "caulfield": 44906, "waikiki": 44907, "salute": 44908, "scou": 44909, "muay": 44910, "louisvuitton": 44911, "bakhta": 44912, "adog": 44913, "inventions": 44914, "hurd": 44915, "foreclo": 44916, "streamline": 44917, "thalaivar": 44918, "chsnews": 44919, "willard": 44920, "tsn": 44921, "europarl": 44922, "crusher": 44923, "mysore": 44924, "grower": 44925, "raping": 44926, "patti": 44927, "gden": 44928, "smw": 44929, "mufti": 44930, "kidman": 44931, "abr": 44932, "sounders": 44933, "skeptical": 44934, "ðŁĶİ": 44935, "sundar": 44936, "ime": 44937, "ferg": 44938, "featherweight": 44939, "arlington": 44940, "pasqu": 44941, "agazine": 44942, "wearable": 44943, "natic": 44944, "mcclure": 44945, "intermitt": 44946, "horde": 44947, "sixties": 44948, "carte": 44949, "bhav": 44950, "zeal": 44951, "experiential": 44952, "adorned": 44953, "sommer": 44954, "enote": 44955, "hypothesis": 44956, "stinky": 44957, "proto": 44958, "deadlines": 44959, "vogel": 44960, "musings": 44961, "moncton": 44962, "guter": 44963, "fle": 44964, "acion": 44965, "voiceof": 44966, "tasha": 44967, "inhabitants": 44968, "typeface": 44969, "sba": 44970, "btsx": 44971, "ðŁĶĴ": 44972, "worx": 44973, "uhc": 44974, "joko": 44975, "cellars": 44976, "goro": 44977, "continuum": 44978, "...&": 44979, 
"weathercee": 44980, "hap": 44981, "srk": 44982, "risers": 44983, "lonelyplanet": 44984, "unnamed": 44985, "coeur": 44986, "ðŁįĮ": 44987, "theworld": 44988, "ilike": 44989, "fasten": 44990, "amigo": 44991, "riba": 44992, "ramaphosa": 44993, "staffers": 44994, "hadley": 44995, "??\"": 44996, "fiore": 44997, "salut": 44998, "huff": 44999, "bezos": 45000, "Ñĭ": 45001, "rader": 45002, "kamala": 45003, "inline": 45004, "fillers": 45005, "umatic": 45006, "allin": 45007, "shatter": 45008, "rein": 45009, "oku": 45010, "chases": 45011, "flagged": 45012, "babymetal": 45013, "waterstones": 45014, "tsb": 45015, "cutout": 45016, "ophel": 45017, "aama": 45018, "rockabilly": 45019, "stolic": 45020, "jetblue": 45021, "ichick": 45022, "downton": 45023, "uzbekistan": 45024, "patna": 45025, "laq": 45026, "grange": 45027, ")_/": 45028, "subsidi": 45029, "scp": 45030, "newscast": 45031, "itsa": 45032, "tweetyour": 45033, "emor": 45034, "archaeologists": 45035, "unification": 45036, "porta": 45037, "qx": 45038, "protectors": 45039, "prohib": 45040, "charisma": 45041, "cartag": 45042, "renfre": 45043, "sculpt": 45044, "guwahati": 45045, "dema": 45046, "boop": 45047, "unfpa": 45048, "dexter": 45049, "layla": 45050, "alleges": 45051, "soups": 45052, "neveragain": 45053, "lys": 45054, "calc": 45055, "baroness": 45056, "visualize": 45057, "gerber": 45058, "absorbed": 45059, "iers": 45060, "ahan": 45061, "fontein": 45062, "detectors": 45063, "verstappen": 45064, "svc": 45065, "formulated": 45066, "acdc": 45067, "lix": 45068, "incompetent": 45069, "bhk": 45070, "lourdes": 45071, "waterhouse": 45072, "snowed": 45073, "appreciative": 45074, "sigma": 45075, "lizasoberano": 45076, "penned": 45077, "paycheck": 45078, "tallinn": 45079, "fancafe": 45080, "parisi": 45081, "avalley": 45082, "vig": 45083, "rufc": 45084, "hardship": 45085, "socute": 45086, "poise": 45087, "ì¹": 45088, "rothschild": 45089, "kly": 45090, "????????": 45091, "lhp": 45092, "ilay": 45093, "fhs": 45094, "amad": 45095, "ideals": 45096, "bradbury": 45097, "balboa": 45098, "nicot": 45099, "kidnap": 45100, "wolve": 45101, "tasmanian": 45102, "opt": 45103, "matthias": 45104, "ãĥ³ãĤ": 45105, "supermarkets": 45106, "mylittlepony": 45107, "melee": 45108, "lister": 45109, "groun": 45110, "fedora": 45111, "kindness": 45112, "enen": 45113, "brahms": 45114, "¯\\_(": 45115, "roswell": 45116, "marlene": 45117, "icu": 45118, "reformation": 45119, "orail": 45120, "hebrides": 45121, "disparities": 45122, "terracotta": 45123, "swallows": 45124, "reid": 45125, "influencing": 45126, "fluor": 45127, "dene": 45128, "tumour": 45129, "blondes": 45130, "thunderbird": 45131, "sheva": 45132, "mogadishu": 45133, "kab": 45134, "creeps": 45135, "iving": 45136, "eneed": 45137, "annoy": 45138, "âĶĢ": 45139, "intrigue": 45140, "enquiry": 45141, "araj": 45142, "tural": 45143, "kubernetes": 45144, "endlessly": 45145, "dividends": 45146, "tora": 45147, "tish": 45148, "commemorates": 45149, "unra": 45150, "trib": 45151, "ponty": 45152, "nem": 45153, "dissent": 45154, "brewingco": 45155, "ðŁĺ½": 45156, "normali": 45157, "biof": 45158, "(...": 45159, "chillen": 45160, "주": 45161, "mellon": 45162, "avis": 45163, "mccormack": 45164, "ingra": 45165, "enriched": 45166, "customerexperience": 45167, "testosterone": 45168, "snug": 45169, "setti": 45170, "geronimo": 45171, "inquirer": 45172, "breaches": 45173, "verything": 45174, "blooming": 45175, "mura": 45176, "dispos": 45177, "bide": 45178, "deva": 45179, "shadesof": 45180, "intrin": 45181, "shev": 45182, "sven": 45183, "nayanthara": 45184, 
"ganesha": 45185, "cws": 45186, "berta": 45187, "labelled": 45188, "useum": 45189, "nicknamed": 45190, "mahan": 45191, "caruso": 45192, "apur": 45193, "ðŁijĨ": 45194, "wq": 45195, "orphanage": 45196, "discarded": 45197, "magnu": 45198, "lue": 45199, "jeon": 45200, "bridgeport": 45201, "pacing": 45202, "mercury": 45203, "(ðŁĵ¸": 45204, "marxist": 45205, "amphibious": 45206, "transplantation": 45207, "stitching": 45208, "thenburg": 45209, "gradual": 45210, "ãĤĮ": 45211, "roft": 45212, "mails": 45213, "inec": 45214, "guyana": 45215, "doppelg": 45216, "vero": 45217, "rewrite": 45218, "headless": 45219, "harbaugh": 45220, "gateway": 45221, "carsforsale": 45222, "swi": 45223, "stis": 45224, "macht": 45225, "unde": 45226, "surabaya": 45227, "stapleton": 45228, "nurturing": 45229, "milner": 45230, "yao": 45231, "lmaoooo": 45232, "kosh": 45233, "arsenal": 45234, "kame": 45235, "erry": 45236, "arroyo": 45237, "dismisses": 45238, "rubbed": 45239, "rcb": 45240, "lewd": 45241, "dilu": 45242, "andor": 45243, "vide": 45244, "urin": 45245, "intersec": 45246, "haar": 45247, "alb": 45248, "yearswith": 45249, "appleton": 45250, "éal": 45251, "ullivan": 45252, "succu": 45253, "monterrey": 45254, "dmx": 45255, "artemis": 45256, "ronnie": 45257, "farmland": 45258, "sfootball": 45259, "grotto": 45260, "anthi": 45261, "ãĢģ": 45262, "à®Ł": 45263, "vidya": 45264, "jimmyfallon": 45265, "àµį": 45266, "tzer": 45267, "gravitational": 45268, "wthr": 45269, "uhhh": 45270, "ehr": 45271, "tinker": 45272, "tijuana": 45273, "scranton": 45274, "ramcharan": 45275, "barclay": 45276, "revan": 45277, "msi": 45278, "kap": 45279, "wrs": 45280, "wethenorth": 45281, "toral": 45282, "satu": 45283, "grom": 45284, "facep": 45285, "erickson": 45286, "zyn": 45287, "sedge": 45288, "oodle": 45289, "spursofficial": 45290, "dsp": 45291, "sicilian": 45292, "solihull": 45293, "receivers": 45294, "ladakh": 45295, "hendrick": 45296, "theri": 45297, "presiding": 45298, "mcguinness": 45299, "litters": 45300, "gunnar": 45301, "ghoul": 45302, "wib": 45303, "ntv": 45304, "karo": 45305, "frock": 45306, "blau": 45307, "amplify": 45308, "allis": 45309, "ullah": 45310, "memoirs": 45311, "khloe": 45312, "interceptions": 45313, "petday": 45314, "looney": 45315, "confin": 45316, "chay": 45317, "piyushgoyal": 45318, "frequencies": 45319, "utz": 45320, "eventual": 45321, "warmly": 45322, "oblivion": 45323, "anka": 45324, "tait": 45325, "âĿ¤ï¸ı.": 45326, "directorial": 45327, "rulers": 45328, "princes": 45329, "muck": 45330, "sturridge": 45331, "deuce": 45332, "abridged": 45333, "baguette": 45334, "uncles": 45335, "pendu": 45336, "minding": 45337, "forrester": 45338, "avila": 45339, "waller": 45340, "wallstreet": 45341, "mentor": 45342, "hino": 45343, "highway": 45344, "cromwell": 45345, "fanartfriday": 45346, "mbi": 45347, "coyle": 45348, "ahi": 45349, "trove": 45350, "spiegel": 45351, "paytm": 45352, "mcintosh": 45353, "jansen": 45354, "niti": 45355, "nashville": 45356, "leno": 45357, "leicestershire": 45358, "legos": 45359, "dict": 45360, "ðŁĵ½": 45361, "spad": 45362, "beverlyhills": 45363, "syrah": 45364, "separates": 45365, "zain": 45366, "unfit": 45367, "drags": 45368, "tania": 45369, "overflowing": 45370, "hrithik": 45371, "hawthorn": 45372, "zani": 45373, "macfar": 45374, "fide": 45375, "totem": 45376, "peds": 45377, "fundamentally": 45378, "calico": 45379, "sinner": 45380, "jä": 45381, "hilde": 45382, "dsd": 45383, "tenay": 45384, "tahit": 45385, "milf": 45386, "lieb": 45387, "informing": 45388, "uplift": 45389, "rael": 45390, "mortgages": 45391, 
"lect": 45392, "iiii": 45393, "guillaume": 45394, "composites": 45395, "oldsmobile": 45396, "lend": 45397, "garth": 45398, "commish": 45399, "baptized": 45400, "scorpions": 45401, "rucker": 45402, "bringbackour": 45403, "alliance": 45404, "thalapathy": 45405, "tali": 45406, "spans": 45407, "eridge": 45408, "witherspoon": 45409, "linda": 45410, "skylar": 45411, "korn": 45412, "homs": 45413, "Äį": 45414, "silenced": 45415, "caffe": 45416, "arty": 45417, "distinguish": 45418, "towed": 45419, "pung": 45420, "jessica": 45421, "earnest": 45422, "beaufort": 45423, "tama": 45424, "studyabroad": 45425, "sikhs": 45426, "newbie": 45427, "navratri": 45428, "marble": 45429, "lounging": 45430, "litter": 45431, "dalit": 45432, "sosa": 45433, "izes": 45434, "grade": 45435, "compromising": 45436, "triton": 45437, "detta": 45438, "vj": 45439, "chauffe": 45440, "spectral": 45441, "powered": 45442, "montessori": 45443, "articulate": 45444, "halton": 45445, "alco": 45446, "yey": 45447, "mntwins": 45448, "acounty": 45449, "ðŁijıðŁı¾": 45450, "âīĪ": 45451, "madmen": 45452, "kala": 45453, "grum": 45454, "chik": 45455, "atis": 45456, "sume": 45457, "akhtar": 45458, "jobsearch": 45459, "highlighter": 45460, "boath": 45461, "âĦ¹": 45462, "tarzan": 45463, "lambo": 45464, "âĽĦï¸ı": 45465, "oxfam": 45466, "dumpster": 45467, "pretzels": 45468, "macos": 45469, "inclined": 45470, "factual": 45471, "advertisers": 45472, "shui": 45473, "puree": 45474, "mlpfi": 45475, "antidote": 45476, "capo": 45477, "pastr": 45478, "mercado": 45479, "button": 45480, "armin": 45481, "agg": 45482, "lolla": 45483, "horribly": 45484, "errands": 45485, "christophe": 45486, "timesnow": 45487, "mondaymotiv": 45488, "liss": 45489, "scandals": 45490, "mci": 45491, "disproportion": 45492, "âĺİ": 45493, "surpass": 45494, "samaritan": 45495, "sotho": 45496, "purest": 45497, "flatt": 45498, "triviatuesday": 45499, "delectable": 45500, "leopold": 45501, "hermione": 45502, "choudhary": 45503, "enrich": 45504, "¡¡": 45505, "subsidiary": 45506, "inequalities": 45507, "bachelor": 45508, "autoimmune": 45509, "lakota": 45510, "ihop": 45511, "adjec": 45512, "thesimpsons": 45513, "shes": 45514, "sek": 45515, "gretchen": 45516, "upstream": 45517, "hinakhan": 45518, "copernic": 45519, "xtina": 45520, "lug": 45521, "toughness": 45522, "ead": 45523, "clipped": 45524, "bius": 45525, "slv": 45526, "fahren": 45527, "deepak": 45528, "cau": 45529, "xan": 45530, "immature": 45531, "digni": 45532, "bobs": 45533, "shredding": 45534, "buttery": 45535, "accommodations": 45536, "deven": 45537, "chunks": 45538, "superleague": 45539, "skybet": 45540, "kildare": 45541, "jeet": 45542, "ëį": 45543, "cek": 45544, "wrecks": 45545, "propane": 45546, "ohl": 45547, "tbd": 45548, "quoi": 45549, "trumpp": 45550, "mimo": 45551, "reluctant": 45552, "verne": 45553, "oic": 45554, "magh": 45555, "arnau": 45556, "sever": 45557, "lidge": 45558, "stairway": 45559, "kicchasudeep": 45560, "ðŁĶº": 45561, "machining": 45562, "aamaadmi": 45563, "oti": 45564, "cda": 45565, "alit": 45566, "pany": 45567, "installs": 45568, "acct": 45569, "eshop": 45570, "diem": 45571, "hardwell": 45572, "fulfillment": 45573, "scafe": 45574, "quack": 45575, "extracts": 45576, "sweetened": 45577, "fighton": 45578, "fdi": 45579, "dinger": 45580, "waltham": 45581, "usur": 45582, "referees": 45583, "seokjin": 45584, "grann": 45585, "afrin": 45586, "thn": 45587, "schaf": 45588, "parcels": 45589, "betis": 45590, "amarine": 45591, "noman": 45592, "khtar": 45593, "moritz": 45594, "coupling": 45595, "barons": 45596, "ðŁIJ¸": 
45597, "ø": 45598, "slp": 45599, "sadler": 45600, "xander": 45601, "triad": 45602, "mcmillan": 45603, "khz": 45604, "dividing": 45605, "ìĹijìĨĮ": 45606, "daryl": 45607, "zedd": 45608, "leys": 45609, "plaques": 45610, "fluori": 45611, "tipperary": 45612, "onnell": 45613, "didier": 45614, "langford": 45615, "imc": 45616, "thesun": 45617, "birdies": 45618, "archa": 45619, "yessss": 45620, "tdi": 45621, "daria": 45622, "candace": 45623, "altam": 45624, "palaces": 45625, "chit": 45626, "santam": 45627, "eventful": 45628, "bookof": 45629, "adb": 45630, "monstax": 45631, "creole": 45632, "coel": 45633, "âĸ½": 45634, "wearen": 45635, "stennis": 45636, "sheath": 45637, "atism": 45638, "groningen": 45639, "mlpfim": 45640, "lepre": 45641, "wrongly": 45642, "rspca": 45643, "rendezvous": 45644, "acknowledging": 45645, "pelvic": 45646, "solicitor": 45647, "slays": 45648, "nuestra": 45649, "lod": 45650, "islander": 45651, "feroci": 45652, "fashionshow": 45653, "rass": 45654, "dgeon": 45655, "adolescents": 45656, "smashes": 45657, "negligence": 45658, "grateful": 45659, "vedere": 45660, "swoop": 45661, "ingl": 45662, "apolice": 45663, "vandalism": 45664, "gann": 45665, "joao": 45666, "disupdates": 45667, "zimbabwe": 45668, "underage": 45669, "radiance": 45670, "wof": 45671, "bourgeo": 45672, "plas": 45673, "crani": 45674, "ghue": 45675, "wreckem": 45676, "warrants": 45677, "reform": 45678, "jimmie": 45679, "atwood": 45680, "ysl": 45681, "neilhimself": 45682, "lbj": 45683, "iman": 45684, "tanto": 45685, "noisse": 45686, "verbs": 45687, "equipo": 45688, "altogether": 45689, "mament": 45690, "lice": 45691, "douglass": 45692, "tierney": 45693, "primed": 45694, "jhal": 45695, "furnitu": 45696, "brazili": 45697, "vill": 45698, "pastels": 45699, "nison": 45700, "uff": 45701, "paralysis": 45702, "jaye": 45703, "impo": 45704, "ðŁijģ": 45705, "strategically": 45706, "pakistanis": 45707, "wassup": 45708, "superbike": 45709, "thanku": 45710, "truelove": 45711, "shaikh": 45712, "israelis": 45713, "vip": 45714, "tog": 45715, "lien": 45716, "laker": 45717, "greyhounds": 45718, "culars": 45719, "bianchi": 45720, "balotelli": 45721, "arran": 45722, "loos": 45723, "strates": 45724, "hebron": 45725, "arvo": 45726, "sunderland": 45727, "theal": 45728, "tombstone": 45729, "sandman": 45730, "cpac": 45731, "thanksgiving": 45732, "lovehim": 45733, "latino": 45734, "anin": 45735, "akaif": 45736, "ĭãĤ": 45737, "torquay": 45738, "diest": 45739, "allianz": 45740, "ðŁĺķ": 45741, "golfclub": 45742, "cllr": 45743, "walcott": 45744, "schnau": 45745, "prompted": 45746, "nominating": 45747, "lennox": 45748, "valet": 45749, "monro": 45750, "mayward": 45751, "eph": 45752, "ðŁĶĶ": 45753, "interoper": 45754, "rda": 45755, "reflex": 45756, "armchair": 45757, "ê°ķ": 45758, "stripper": 45759, "porti": 45760, "pharm": 45761, "hamza": 45762, "nireland": 45763, "neue": 45764, "hpv": 45765, "portfoli": 45766, "sunburn": 45767, "frisbee": 45768, "beal": 45769, "baptiste": 45770, "xh": 45771, "tym": 45772, "prati": 45773, "overs": 45774, "hazrat": 45775, "desert": 45776, "derry": 45777, "usky": 45778, "emmett": 45779, "acharya": 45780, ")_/¯": 45781, "shud": 45782, "maya": 45783, "hamill": 45784, "raim": 45785, "nrc": 45786, "fittings": 45787, "curvy": 45788, "ðŁıĩ": 45789, "sterling": 45790, "à¥Ģ": 45791, "walkin": 45792, "shortcuts": 45793, "milly": 45794, "astur": 45795, "alphabe": 45796, "pli": 45797, "pez": 45798, "missyou": 45799, "radford": 45800, "mlg": 45801, "taeyang": 45802, "notjustlakes": 45803, "dumps": 45804, "serendip": 45805, "leur": 
45806, "raving": 45807, "ester": 45808, "depriv": 45809, "abscbn": 45810, "ðŁijĩðŁı»": 45811, "scarcity": 45812, "ocr": 45813, "meanings": 45814, "capt": 45815, "dahl": 45816, "fermentation": 45817, "brioche": 45818, "towin": 45819, "outlander": 45820, "massimo": 45821, "encro": 45822, "ðŁ¥³": 45823, "built": 45824, "potam": 45825, "kiri": 45826, "tmw": 45827, "monitored": 45828, "kites": 45829, "peoplesvote": 45830, "grayson": 45831, "íģ¬": 45832, "afrika": 45833, "adies": 45834, "ivote": 45835, "gyne": 45836, "gannon": 45837, "dix": 45838, "cmc": 45839, "oural": 45840, "foxandfriends": 45841, "beli": 45842, "igne": 45843, "glan": 45844, "katrinakaif": 45845, "copolitics": 45846, "qualitative": 45847, "psi": 45848, "lucci": 45849, "discoura": 45850, "âĺ®": 45851, "kelli": 45852, "gautam": 45853, "caracas": 45854, "realest": 45855, "pula": 45856, "inus": 45857, "hilltop": 45858, "makeaw": 45859, "attenborough": 45860, "twy": 45861, "rarity": 45862, "peckham": 45863, "mahon": 45864, "cornelius": 45865, "clinicians": 45866, "tonline": 45867, "tbi": 45868, "paradise": 45869, "kasi": 45870, "inevit": 45871, "freshness": 45872, "collingwood": 45873, "lunatic": 45874, "defense": 45875, "copd": 45876, "infra": 45877, "wainwright": 45878, "sainsbury": 45879, "alabam": 45880, "tema": 45881, "laco": 45882, "checker": 45883, "relegated": 45884, "trent": 45885, "stalks": 45886, "huffpost": 45887, "bhubaneswar": 45888, "astral": 45889, "shareyour": 45890, "primrose": 45891, "hime": 45892, "catan": 45893, "endment": 45894, "endow": 45895, "clemens": 45896, "maloney": 45897, "hilary": 45898, "gametime": 45899, "denise": 45900, "collaborators": 45901, "bwo": 45902, "radicals": 45903, "guetta": 45904, "icion": 45905, "aua": 45906, "snapmatic": 45907, "satchel": 45908, "excavation": 45909, "baseman": 45910, "são": 45911, "gnation": 45912, "feld": 45913, "survey": 45914, "shahzad": 45915, "mast": 45916, "anirudhofficial": 45917, "trucker": 45918, "otago": 45919, "geograph": 45920, "ethel": 45921, "âļ¡ï¸ıâļ¡ï¸ı": 45922, "sver": 45923, "mutt": 45924, "internetofthings": 45925, "anchored": 45926, "whouse": 45927, "bangla": 45928, "balmain": 45929, "ç¹ĭãģ": 45930, "breakfa": 45931, "áĢ": 45932, "twister": 45933, "tetris": 45934, "cav": 45935, "stags": 45936, "gz": 45937, "aub": 45938, "stormed": 45939, "helens": 45940, "yarmouth": 45941, "stasy": 45942, "gustavo": 45943, "cosc": 45944, "vinson": 45945, "upp": 45946, "scricket": 45947, "assumptions": 45948, "appe": 45949, "nuh": 45950, "uer": 45951, "premise": 45952, "naga": 45953, "eamon": 45954, "coronary": 45955, "naf": 45956, "northside": 45957, "elmer": 45958, "rotar": 45959, "outlining": 45960, "elf": 45961, "resurg": 45962, "katelyn": 45963, "incan": 45964, "hysteria": 45965, "cee": 45966, "ambani": 45967, "prolly": 45968, "ĮãĤĬãģ": 45969, "axes": 45970, "sanjose": 45971, "rembrandt": 45972, "magpie": 45973, "evenly": 45974, "scorsese": 45975, "quaint": 45976, "fg": 45977, "bbuk": 45978, "indianfootball": 45979, "weareall": 45980, "spdwy": 45981, "pisces": 45982, "ecg": 45983, "âĺħâĺħâĺħâĺħâĺħ": 45984, "preorders": 45985, ":|": 45986, "nipple": 45987, "salazar": 45988, "jume": 45989, "jailbreak": 45990, "minn": 45991, "bassett": 45992, "zetta": 45993, "jeffree": 45994, "adjun": 45995, "ticon": 45996, "sandiego": 45997, "drinklocal": 45998, "cholera": 45999, "solicitors": 46000, "obo": 46001, "compost": 46002, "nian": 46003, "wra": 46004, "treach": 46005, "icic": 46006, "professional": 46007, "delve": 46008, "legate": 46009, "historia": 46010, "croissant": 
46011, "connoisse": 46012, "namo": 46013, "palliative": 46014, "chemtrails": 46015, "iority": 46016, "globalwarming": 46017, "comicart": 46018, "behavioural": 46019, "rested": 46020, "lias": 46021, "climates": 46022, "ŁãģĦ": 46023, "rutland": 46024, "nourish": 46025, "menopause": 46026, "hotties": 46027, "dementi": 46028, "vespa": 46029, "melville": 46030, "analogue": 46031, "tzman": 46032, "strung": 46033, "imperfect": 46034, "glare": 46035, "circling": 46036, "rosberg": 46037, "reco": 46038, "ocity": 46039, "loire": 46040, "embe": 46041, "dossier": 46042, "neel": 46043, "nando": 46044, "mea": 46045, "galvani": 46046, "finesse": 46047, "agp": 46048, "berkeley": 46049, "asim": 46050, "âĺºâĺº": 46051, "quilted": 46052, "ishere": 46053, "unmatched": 46054, "potion": 46055, "forz": 46056, "atre": 46057, "selfies": 46058, "juliana": 46059, "ðŁļ¶": 46060, "âĸº": 46061, "melton": 46062, "âłĢâłĢâłĢâłĢâłĢâłĢâłĢâłĢ": 46063, "spinrilla": 46064, "purcell": 46065, "edp": 46066, "atleti": 46067, "tonyawards": 46068, "raja": 46069, "progno": 46070, "molten": 46071, "stuff": 46072, "pally": 46073, "nobelprize": 46074, "âĻ»ï¸ı": 46075, "spiritual": 46076, "speake": 46077, "sasha": 46078, "brium": 46079, "truss": 46080, "criticize": 46081, "assassinscreed": 46082, "yoruba": 46083, "ulo": 46084, "fireman": 46085, "workinprogress": 46086, "efcc": 46087, "flares": 46088, "robot": 46089, "hikers": 46090, "cll": 46091, "shadowing": 46092, "patsy": 46093, "lehman": 46094, "cns": 46095, "å±": 46096, "guadal": 46097, "à±į": 46098, "rape": 46099, "rhonda": 46100, "parallels": 46101, "sonja": 46102, "language": 46103, "landings": 46104, "zola": 46105, "cramps": 46106, "burning": 46107, "appraisal": 46108, "jolla": 46109, "hamm": 46110, "kasa": 46111, "gully": 46112, "fgo": 46113, "ulysses": 46114, "ribe": 46115, "ðŁĴĦ": 46116, "ibu": 46117, "etienne": 46118, "briar": 46119, "finely": 46120, "combating": 46121, "yql": 46122, "gotham": 46123, "wechat": 46124, "topaz": 46125, "primaries": 46126, "lse": 46127, "izz": 46128, "hele": 46129, "disponible": 46130, "cystic": 46131, "belichick": 46132, "thrush": 46133, "kansascity": 46134, "geom": 46135, "solidi": 46136, "redbubble": 46137, "bystand": 46138, "cambridgeshire": 46139, "parfait": 46140, "astle": 46141, "owo": 46142, "indore": 46143, "stomping": 46144, "smelly": 46145, "ðŁ¤ĸ": 46146, "locomo": 46147, "admitting": 46148, "holme": 46149, "clockwise": 46150, "minsk": 46151, "mcco": 46152, "forget": 46153, "evp": 46154, "camra": 46155, "abella": 46156, "yotes": 46157, "universityof": 46158, "méxico": 46159, "silverado": 46160, "ricket": 46161, "crombie": 46162, "puj": 46163, "eradicate": 46164, "delight": 46165, "ygo": 46166, "glamping": 46167, "vica": 46168, "duggan": 46169, "counters": 46170, "cfd": 46171, "scour": 46172, "reactjs": 46173, "puram": 46174, "parasites": 46175, "inki": 46176, "villen": 46177, "stella": 46178, "limbo": 46179, "angas": 46180, "kcr": 46181, "ðŁĴļðŁĴļðŁĴļ": 46182, "vapori": 46183, "mumford": 46184, "oligar": 46185, "à¼": 46186, "aloo": 46187, "booties": 46188, "adr": 46189, "kelli": 46190, "drummers": 46191, "avici": 46192, "natureuk": 46193, "ronal": 46194, "intrac": 46195, "unsplash": 46196, "leche": 46197, "goma": 46198, "eline": 46199, "enviro": 46200, "bionic": 46201, "bueno": 46202, "mik": 46203, "avin": 46204, "starling": 46205, "empowers": 46206, "cakeday": 46207, "boycot": 46208, "ðŁĴļðŁĴļ": 46209, "ðŁĮ¸ðŁĮ¸": 46210, "vach": 46211, "mci": 46212, "fractures": 46213, "geri": 46214, "sking": 46215, "excluded": 46216, "luce": 46217, 
"jave": 46218, "iggy": 46219, "eviden": 46220, "akistan": 46221, "awn": 46222, "morals": 46223, "lucifer": 46224, "haban": 46225, "tumbling": 46226, "sundaymotivation": 46227, "mosley": 46228, "captainamerica": 46229, "schicago": 46230, "theone": 46231, "motd": 46232, "dts": 46233, "ðŁIJ¼": 46234, "repell": 46235, "iii": 46236, "locust": 46237, "geospatial": 46238, "mersey": 46239, "immerse": 46240, "descend": 46241, "bernade": 46242, "js": 46243, "boatsales": 46244, "winder": 46245, "crank": 46246, "singleton": 46247, "candidacy": 46248, "bena": 46249, "ðŁı»âĢį": 46250, "highlander": 46251, "olt": 46252, "kprs": 46253, "healthylifestyle": 46254, "fourteen": 46255, "endthe": 46256, "ithaca": 46257, "circulated": 46258, "rans": 46259, "prevalent": 46260, "havas": 46261, "splendor": 46262, "rooster": 46263, "kalamazoo": 46264, "jewellers": 46265, "ennedy": 46266, "rousey": 46267, "esy": 46268, "cannons": 46269, "ornamental": 46270, "////": 46271, "rendon": 46272, "winne": 46273, "molding": 46274, "eidmubarak": 46275, "countess": 46276, "simona": 46277, "hawa": 46278, "foes": 46279, "duster": 46280, "sbu": 46281, "portray": 46282, "marries": 46283, "goodday": 46284, "choco": 46285, "achiever": 46286, "ðŁĺ¹ðŁĺ¹": 46287, "preneur": 46288, "tramp": 46289, "tomi": 46290, "nbat": 46291, "gardenchat": 46292, "farrakhan": 46293, "everglades": 46294, "abru": 46295, "sousa": 46296, "sece": 46297, "homeswee": 46298, "terrestrial": 46299, "barit": 46300, "sridevi": 46301, "olu": 46302, "melinda": 46303, "frick": 46304, "candies": 46305, "ðŁĺŃðŁĴķ": 46306, "qureshi": 46307, "familyfun": 46308, "exorcist": 46309, "cardinal": 46310, "nyt": 46311, "diesel": 46312, "cumulus": 46313, "capricorn": 46314, "siology": 46315, "lorna": 46316, "dougie": 46317, "andie": 46318, "supersport": 46319, "cfl": 46320, "пÑĢи": 46321, "sayang": 46322, "peek": 46323, "à¸Ĭ": 46324, "lobe": 46325, "jem": 46326, "inglis": 46327, "ggled": 46328, "csn": 46329, "amnesty": 46330, "chups": 46331, "baes": 46332, "sauer": 46333, "ðŁıIJ": 46334, "mongolian": 46335, "enet": 46336, "backstreet": 46337, "drilled": 46338, "accessing": 46339, "ceo": 46340, "bse": 46341, "aiken": 46342, "purr": 46343, "worsen": 46344, "wheres": 46345, "wark": 46346, "testifying": 46347, "buri": 46348, "blast": 46349, "awg": 46350, "ðŁĵĭ": 46351, "redefining": 46352, "hearing": 46353, "uci": 46354, "cmp": 46355, "boni": 46356, "tailoring": 46357, "taji": 46358, "nocchi": 46359, "emt": 46360, "stephenking": 46361, "neet": 46362, "complains": 46363, "campaigner": 46364, "luciano": 46365, "twilight": 46366, "tiesto": 46367, "passports": 46368, "floyd": 46369, "cathedr": 46370, "naked": 46371, "caregiver": 46372, "bcoz": 46373, "adecides": 46374, "kuri": 46375, "lyk": 46376, "braries": 46377, "drenched": 46378, "disclose": 46379, "ðŁĴªðŁı½": 46380, "leblanc": 46381, "jetty": 46382, "garty": 46383, "chipmun": 46384, "bsu": 46385, "rhythmic": 46386, "icz": 46387, "frid": 46388, "annex": 46389, "amex": 46390, "soloist": 46391, "lancers": 46392, "arrowhead": 46393, "specification": 46394, "simulated": 46395, "nais": 46396, "inverte": 46397, "bowing": 46398, "worship": 46399, "fz": 46400, "aboss": 46401, "shaq": 46402, "ì¶ķ": 46403, "challengers": 46404, "anarch": 46405, "aamaadmiparty": 46406, "ãħĭãħĭãħĭ": 46407, "suffolk": 46408, "socorro": 46409, "snell": 46410, "cladding": 46411, "absorbing": 46412, "shawa": 46413, "participates": 46414, "ðŁįĶ": 46415, "bookstores": 46416, "baku": 46417, "seaport": 46418, "kojima": 46419, "gaby": 46420, "packard": 46421, 
"electrician": 46422, "letit": 46423, "mowing": 46424, "fawad": 46425, "youngjae": 46426, "hotmail": 46427, "mening": 46428, "urie": 46429, "intimacy": 46430, "conti": 46431, ":\")": 46432, "lifeisgood": 46433, "inciner": 46434, "idri": 46435, "craziness": 46436, "journos": 46437, "franchi": 46438, "bottlen": 46439, "alda": 46440, "ffes": 46441, "kx": 46442, "southwe": 46443, "aira": 46444, "clayton": 46445, "scoti": 46446, "fj": 46447, "briga": 46448, "ðŁ¤ĺðŁı»": 46449, "demonstrators": 46450, "yz": 46451, "stork": 46452, "naq": 46453, "cascades": 46454, "travelchat": 46455, "plata": 46456, "padma": 46457, "franci": 46458, "attain": 46459, "batgirl": 46460, "lombard": 46461, "hoos": 46462, "ddos": 46463, "neonatal": 46464, "disclaimer": 46465, "rss": 46466, "rant": 46467, "disen": 46468, "texaste": 46469, "socal": 46470, "fractal": 46471, "camry": 46472, "strife": 46473, "snacking": 46474, "muh": 46475, "santander": 46476, "morons": 46477, "graf": 46478, "parades": 46479, "huston": 46480, "drupal": 46481, "miento": 46482, "kirstel": 46483, "hyde": 46484, "vomit": 46485, "fortified": 46486, "sphinx": 46487, "dav": 46488, "biryani": 46489, "winnings": 46490, "sbaseball": 46491, "merged": 46492, "lovelondon": 46493, "lingering": 46494, "dreambig": 46495, "carleton": 46496, "livelihood": 46497, "django": 46498, "astrid": 46499, "grids": 46500, "downe": 46501, "bruised": 46502, "sne": 46503, "scarecrow": 46504, "helium": 46505, "fnc": 46506, "biggs": 46507, "anter": 46508, "restorative": 46509, "empires": 46510, "abdel": 46511, "lifestyle": 46512, "kiwanis": 46513, "colloquium": 46514, "meen": 46515, "prick": 46516, "antique": 46517, "zeb": 46518, "mimic": 46519, "edmonds": 46520, "ðŁijĬðŁijĬ": 46521, "qing": 46522, "ppel": 46523, "mcgill": 46524, "interpreting": 46525, "âŀķ": 46526, "rashad": 46527, "doka": 46528, "narrator": 46529, "electromagnetic": 46530, "ashby": 46531, "saura": 46532, "irandeal": 46533, "âģīï¸ı": 46534, "krishnan": 46535, "indi": 46536, "ffen": 46537, "brea": 46538, "osman": 46539, "multinational": 46540, "chippe": 46541, "recruiters": 46542, "ausbiz": 46543, "pounding": 46544, "regen": 46545, "cursor": 46546, "refusal": 46547, "macs": 46548, "inak": 46549, "axial": 46550, "waifu": 46551, "upcycled": 46552, "hindustan": 46553, "cassini": 46554, "carlyle": 46555, "scratches": 46556, "reef": 46557, "manatee": 46558, "eatery": 46559, "ðŁĵ¢": 46560, "uncondition": 46561, "senpai": 46562, "onther": 46563, "comicbook": 46564, "prosciutto": 46565, "demar": 46566, "mise": 46567, "mage": 46568, "freec": 46569, "ayesha": 46570, "alder": 46571, "androidgames": 46572, "leyton": 46573, "hock": 46574, "doorway": 46575, "chicagofire": 46576, "aaliyah": 46577, "swelling": 46578, "bix": 46579, ".ðŁĺĤ": 46580, "evankirstel": 46581, "torpedo": 46582, "konstant": 46583, "genevieve": 46584, "maia": 46585, "hauser": 46586, "dotorg": 46587, "hideous": 46588, "fik": 46589, "spraw": 46590, "eek": 46591, "zappa": 46592, "wandered": 46593, "''": 46594, "rajan": 46595, "bambi": 46596, "($)": 46597, "widening": 46598, "toolbox": 46599, "sair": 46600, "illuminating": 46601, "prays": 46602, "outpatient": 46603, "iw": 46604, "dayo": 46605, "lob": 46606, "swfl": 46607, "shades": 46608, "gums": 46609, "cookin": 46610, "kodi": 46611, "griffin": 46612, "traumati": 46613, "stea": 46614, "slaughtered": 46615, "godbless": 46616, "airtime": 46617, "pseudo": 46618, "bsa": 46619, "hauled": 46620, "arif": 46621, "à¸Ńà¸ĩ": 46622, "lel": 46623, "wcpo": 46624, "militi": 46625, "charters": 46626, "worlda": 46627, 
"ruk": 46628, "kgs": 46629, "digitalindia": 46630, "isable": 46631, "idyllic": 46632, "espino": 46633, "marietta": 46634, "ebo": 46635, "teamcanada": 46636, "abour": 46637, "wilton": 46638, "rockstars": 46639, "favored": 46640, "physic": 46641, "wrinkle": 46642, "tbr": 46643, "dprint": 46644, "ballarat": 46645, "adal": 46646, "zey": 46647, "ðŁĺįðŁĶ¥": 46648, "tomlin": 46649, "mtr": 46650, "palsy": 46651, "fenerbah": 46652, "tighten": 46653, "philia": 46654, "ironing": 46655, "ryu": 46656, "bant": 46657, "enquire": 46658, "cair": 46659, "aburger": 46660, "trun": 46661, "greenberg": 46662, "chauhan": 46663, "irina": 46664, "shani": 46665, "trendsetter": 46666, "prett": 46667, "zafar": 46668, "alove": 46669, "vici": 46670, "panic": 46671, "noo": 46672, "lustre": 46673, "disrupted": 46674, "ballis": 46675, "sonsof": 46676, "monsi": 46677, "instac": 46678, "akest": 46679, "ëĭ¤": 46680, "kwame": 46681, "horrormovies": 46682, "district": 46683, "saucy": 46684, "mban": 46685, "armies": 46686, "withdrawn": 46687, "medics": 46688, "loftus": 46689, "eroom": 46690, "bekind": 46691, "arns": 46692, "allon": 46693, "unison": 46694, "davids": 46695, "crat": 46696, "nicotine": 46697, "soor": 46698, "smx": 46699, "onco": 46700, "cosplaying": 46701, "zombies": 46702, "harms": 46703, "eger": 46704, "rosy": 46705, "moonshine": 46706, "fein": 46707, "cett": 46708, "dubrov": 46709, "regents": 46710, "benitez": 46711, "ðŁijıðŁı¼ðŁijıðŁı¼": 46712, "stec": 46713, "malia": 46714, "prioritize": 46715, "iceland": 46716, "ftse": 46717, "vamo": 46718, "lamont": 46719, "homosexuality": 46720, "brees": 46721, "regui": 46722, "cbp": 46723, "tej": 46724, "skysports": 46725, "detergent": 46726, "shasta": 46727, "derel": 46728, "conservancy": 46729, "colorized": 46730, "accolades": 46731, "viso": 46732, "showyour": 46733, "nanow": 46734, "biceps": 46735, "usability": 46736, "bim": 46737, "dailysketch": 46738, "pearljam": 46739, "strangest": 46740, "megadeth": 46741, "broadcasts": 46742, "barren": 46743, "arton": 46744, "chriss": 46745, "configu": 46746, "lures": 46747, "isthe": 46748, "eul": 46749, "railwayana": 46750, "globalhealth": 46751, "gianni": 46752, "uaap": 46753, "slum": 46754, "consciously": 46755, "abre": 46756, "nup": 46757, "budget": 46758, "vada": 46759, "esch": 46760, "realness": 46761, "erased": 46762, "thunt": 46763, "bez": 46764, "armistice": 46765, "ðŁij¹": 46766, "shrun": 46767, "oled": 46768, "driverless": 46769, "ðŁ¤·ðŁı»âĢįâĻĢï¸ı": 46770, "wondr": 46771, "skan": 46772, "salaam": 46773, "motherland": 46774, "hwang": 46775, "geno": 46776, "gangnam": 46777, "twright": 46778, "endorsing": 46779, "enic": 46780, "adoration": 46781, "paused": 46782, "patricks": 46783, "docked": 46784, "platte": 46785, "ffxv": 46786, "ethnicity": 46787, "autoshow": 46788, "sideshow": 46789, "afterlife": 46790, "relocated": 46791, "orphaned": 46792, "foodnetwork": 46793, "dareto": 46794, "andra": 46795, "slaps": 46796, "vlive": 46797, "swims": 46798, "reimagined": 46799, "mistle": 46800, "revise": 46801, "reality": 46802, "bharti": 46803, "ðŁĴĻðŁĴĽ": 46804, "latest": 46805, "proudest": 46806, "grasses": 46807, "lanyard": 46808, "freshest": 46809, "carcinoma": 46810, "anomaly": 46811, "ziegler": 46812, "sumner": 46813, "lyrix": 46814, "gorg": 46815, "isd": 46816, "avel": 46817, "swildlife": 46818, "mesqu": 46819, "johncena": 46820, "euroleague": 46821, "saber": 46822, "masterful": 46823, "yarra": 46824, "cognition": 46825, "jacobson": 46826, "abolic": 46827, "sirloin": 46828, "shukla": 46829, "mojito": 46830, "supere": 46831, 
"stweet": 46832, "mez": 46833, "esa": 46834, "rudolf": 46835, "gura": 46836, "whereyou": 46837, "ttm": 46838, "wins": 46839, "trustworthy": 46840, "nyk": 46841, "braden": 46842, "tabletop": 46843, "goodfood": 46844, "eson": 46845, "bek": 46846, "linguistic": 46847, "grays": 46848, "chath": 46849, "hcs": 46850, "moni": 46851, "deans": 46852, "cussions": 46853, "chell": 46854, "slows": 46855, "hemi": 46856, "dapp": 46857, "sharpie": 46858, "boosters": 46859, "aos": 46860, "strack": 46861, "sedona": 46862, "mueller": 46863, "hardwick": 46864, "ornate": 46865, "thora": 46866, "salud": 46867, "otwol": 46868, "chum": 46869, "miho": 46870, "forage": 46871, "thelittle": 46872, "tearful": 46873, "oneself": 46874, "mindy": 46875, "smg": 46876, "gmbh": 46877, "emerald": 46878, "ðŁĶ´âļªï¸ı": 46879, "tutti": 46880, "receptions": 46881, "revising": 46882, "ibrox": 46883, "topeka": 46884, "salami": 46885, "expanse": 46886, "ibooks": 46887, "dobson": 46888, "clio": 46889, "ats": 46890, "ðŁļĮ": 46891, "moha": 46892, "isance": 46893, "shutters": 46894, "moot": 46895, "janine": 46896, "marvelcomics": 46897, "jordani": 46898, "poser": 46899, "kenneth": 46900, "hyung": 46901, "deja": 46902, "aseball": 46903, "speciality": 46904, "euston": 46905, "classiccar": 46906, "hadith": 46907, "ðŁIJī": 46908, "chasing": 46909, "izo": 46910, "grosven": 46911, "aglia": 46912, "thisdayinhistory": 46913, "trow": 46914, "omile": 46915, "huar": 46916, "byn": 46917, "saline": 46918, "divine": 46919, "demonic": 46920, "tyran": 46921, "handover": 46922, "revitalization": 46923, "paella": 46924, "cryptic": 46925, "sedg": 46926, "mend": 46927, "dunkirk": 46928, "bred": 46929, "wald": 46930, "sportscar": 46931, "aard": 46932, "wheaton": 46933, "daener": 46934, "klan": 46935, "brt": 46936, "bakhtawar": 46937, "spires": 46938, "schubert": 46939, "roti": 46940, "polish": 46941, "ose": 46942, "agame": 46943, "wondercon": 46944, "protestant": 46945, "bosa": 46946, "ðŁĺŁ": 46947, "dü": 46948, "joyride": 46949, "gertrude": 46950, "âĿĿ": 46951, "gila": 46952, "vh": 46953, "twa": 46954, "trav": 46955, "swallowed": 46956, "starve": 46957, "lain": 46958, "entren": 46959, "reiki": 46960, "sukh": 46961, "craic": 46962, "azu": 46963, "webpage": 46964, "keefe": 46965, "hypothe": 46966, "hirsch": 46967, "helle": 46968, "campground": 46969, "wamy": 46970, "travi": 46971, "shahi": 46972, "sandeep": 46973, "rui": 46974, "hanuman": 46975, "dwp": 46976, "repository": 46977, "noor": 46978, "noff": 46979, "unreal": 46980, "pell": 46981, "blackhistory": 46982, "harvick": 46983, "mascar": 46984, "payee": 46985, "pasha": 46986, "gastronomy": 46987, "dÃŃ": 46988, "aig": 46989, "rosenthal": 46990, "openday": 46991, "embellished": 46992, "ttip": 46993, "sunbathing": 46994, "gopack": 46995, "endome": 46996, "ï¸ı#": 46997, "invalid": 46998, "finalfour": 46999, "stfu": 47000, "squishy": 47001, "rasta": 47002, "mosch": 47003, "jamesc": 47004, "dietrich": 47005, "sela": 47006, "melb": 47007, "elvi": 47008, "tdp": 47009, "suni": 47010, "slit": 47011, "jha": 47012, "biza": 47013, "spiked": 47014, "lli": 47015, "lillard": 47016, "vampi": 47017, "synopsis": 47018, "azhar": 47019, "kendricklamar": 47020, "ĮãĤĬãģŁãģĦ": 47021, "heartless": 47022, "countryfile": 47023, "airplay": 47024, "arrogance": 47025, "pree": 47026, "virtuoso": 47027, "ãħłãħłãħłãħł": 47028, "raju": 47029, "lebu": 47030, "forward": 47031, "tug": 47032, "dros": 47033, "mondaymotivaton": 47034, "concepcion": 47035, "thelo": 47036, "padi": 47037, "looool": 47038, "ÑĢод": 47039, "itss": 47040, "ethical": 
47041, "enduro": 47042, "__:": 47043, "expenditure": 47044, "monste": 47045, "masking": 47046, "terriers": 47047, "ibis": 47048, "ember": 47049, "cumple": 47050, "punctuation": 47051, "piper": 47052, "irvin": 47053, "adee": 47054, "yyyyyy": 47055, "flashbacks": 47056, "celsius": 47057, "donnie": 47058, "bogota": 47059, "benevol": 47060, "thescript": 47061, "shilpa": 47062, "prose": 47063, "findia": 47064, "zeke": 47065, "neko": 47066, "doves": 47067, "blueslyrix": 47068, "frosh": 47069, "soweto": 47070, "mplo": 47071, "alai": 47072, "sabi": 47073, "raqqa": 47074, "wftv": 47075, "stroller": 47076, "iansomerhalder": 47077, "ðŁĶª": 47078, "anon": 47079, "moseley": 47080, "!?!?": 47081, "staking": 47082, "moly": 47083, "cartri": 47084, "csg": 47085, "astor": 47086, "transcend": 47087, "maer": 47088, "deux": 47089, "cowgirl": 47090, "sask": 47091, "punter": 47092, "maken": 47093, "oates": 47094, "lovett": 47095, "growler": 47096, "sagin": 47097, "vn": 47098, "ssible": 47099, "officeofrg": 47100, "ymc": 47101, "sabar": 47102, "faulty": 47103, "apha": 47104, "akon": 47105, "ðŁij«": 47106, "snowdon": 47107, "aew": 47108, "raisethe": 47109, "ðĿĵ": 47110, "gruesome": 47111, "clementine": 47112, "sping": 47113, "lata": 47114, "worldenviron": 47115, "mimic": 47116, "canaria": 47117, "bakhtawarbz": 47118, "aoa": 47119, "fala": 47120, "ãĤŃ": 47121, "aviva": 47122, "youuuu": 47123, "thigh": 47124, "ladders": 47125, "gumbo": 47126, "tzky": 47127, "fuzz": 47128, "plasticpollution": 47129, "estate": 47130, "strengthened": 47131, "kant": 47132, "drin": 47133, "calvert": 47134, "transformational": 47135, "frightened": 47136, "maclean": 47137, "elitedangerous": 47138, "earthy": 47139, "tson": 47140, "toda": 47141, "jnu": 47142, "..,": 47143, "michal": 47144, "iban": 47145, "jeong": 47146, "isreal": 47147, "simcoe": 47148, "exclusives": 47149, "bluebells": 47150, "bene": 47151, "teu": 47152, "pilsner": 47153, "penske": 47154, "atheists": 47155, "mpu": 47156, "cartagena": 47157, "ðŁĴĹðŁĴĹ": 47158, "millionaires": 47159, "kkkk": 47160, "itar": 47161, "subscriptions": 47162, "remote": 47163, "mafi": 47164, "hinton": 47165, "wcc": 47166, "hok": 47167, "dsb": 47168, "ableton": 47169, "seventy": 47170, "punks": 47171, "eindhoven": 47172, "shone": 47173, "mcfarlane": 47174, "limpopo": 47175, "emphasi": 47176, "ü": 47177, "sinfo": 47178, "petre": 47179, "mangrove": 47180, "chino": 47181, "bertie": 47182, "playlists": 47183, "pushawards": 47184, "paf": 47185, "debbie": 47186, "cdo": 47187, "rino": 47188, "ðŁı¾âĢįâĻĤï¸ı": 47189, "folke": 47190, "bonnar": 47191, "thine": 47192, "slan": 47193, "halter": 47194, "evie": 47195, "awsome": 47196, "vultures": 47197, "sparky": 47198, "seizures": 47199, "âľĶ": 47200, "ramone": 47201, "ineffe": 47202, "aln": 47203, "proctor": 47204, "astra": 47205, "thevoice": 47206, "grote": 47207, "scion": 47208, "deadline": 47209, "amaya": 47210, "tainted": 47211, "patterned": 47212, "exceeding": 47213, "crossfit": 47214, "kaylee": 47215, "dropbox": 47216, "rushes": 47217, "tackled": 47218, "moby": 47219, "retrogamer": 47220, "ncbd": 47221, "benefitting": 47222, "shaykh": 47223, "guildhall": 47224, "gentry": 47225, "dreamcast": 47226, "dreaded": 47227, "bundled": 47228, "thaw": 47229, "revolving": 47230, "npt": 47231, "kyliejenner": 47232, "imaginative": 47233, "roni": 47234, "overcame": 47235, "familytime": 47236, "dsburg": 47237, "carnaval": 47238, "relationship": 47239, "recognizable": 47240, "coroner": 47241, "hole": 47242, "fanfic": 47243, "emirates": 47244, "burritos": 47245, "analyse": 
47246, "thinner": 47247, "nees": 47248, "gallipoli": 47249, "blr": 47250, "catwoman": 47251, "-->>": 47252, "ault": 47253, "adaily": 47254, "naughty": 47255, "ilio": 47256, "solitaire": 47257, "mtvbr": 47258, "jocelyn": 47259, "arunach": 47260, "repent": 47261, "southgate": 47262, "hyacin": 47263, "essential": 47264, "fenton": 47265, "andum": 47266, "itor": 47267, "gopal": 47268, "slinger": 47269, "posei": 47270, "awil": 47271, "wielding": 47272, "raila": 47273, "elias": 47274, "asto": 47275, "ä": 47276, "tendency": 47277, "strata": 47278, "kert": 47279, "<-": 47280, "imacele": 47281, "daes": 47282, "stimulus": 47283, "hanley": 47284, "fitnes": 47285, "ecstasy": 47286, "limous": 47287, "hailing": 47288, "ðŁ¤Ń": 47289, "chiswick": 47290, "taries": 47291, "slav": 47292, "puli": 47293, "modernization": 47294, "blackmail": 47295, "bingham": 47296, "hfx": 47297, "++": 47298, "ðŁĩ®ðŁĩ³": 47299, "niv": 47300, "wea": 47301, "professor": 47302, "koff": 47303, "bolster": 47304, "suave": 47305, "sequences": 47306, "pepperoni": 47307, "notte": 47308, "dren": 47309, "ãģ¨ç¹ĭãģ": 47310, "hsv": 47311, "oga": 47312, "aptly": 47313, "zad": 47314, "excelsi": 47315, "rinka": 47316, "moldova": 47317, "minn": 47318, "mabel": 47319, "conferencing": 47320, "basing": 47321, "ofer": 47322, "obsi": 47323, "hamillhimself": 47324, "careless": 47325, "briefed": 47326, "inherent": 47327, "parish": 47328, "dubnation": 47329, "townsville": 47330, "sarawak": 47331, "geeky": 47332, "doncasterisgreat": 47333, "wasabi": 47334, "gup": 47335, "pheno": 47336, "drainthe": 47337, "carrieunderwood": 47338, "bleeds": 47339, "bbcworld": 47340, "anew": 47341, "altaf": 47342, "dulwich": 47343, "aniston": 47344, "wti": 47345, "sumatra": 47346, "grafton": 47347, "bln": 47348, "mester": 47349, "bodega": 47350, "rego": 47351, "esq": 47352, "anjo": 47353, "sumptuous": 47354, "maisie": 47355, "�": 47356, "wilt": 47357, "jakob": 47358, "elvis": 47359, "sepul": 47360, "muster": 47361, "airpollution": 47362, "presidente": 47363, "happymonday": 47364, "extensively": 47365, "flondon": 47366, "tls": 47367, "playing": 47368, "peed": 47369, "dinho": 47370, "vardy": 47371, "pika": 47372, "niro": 47373, "aucus": 47374, "ðŁį¦": 47375, "null": 47376, "elondon": 47377, "juventus": 47378, "imagines": 47379, "disab": 47380, "lito": 47381, "dura": 47382, "workplaces": 47383, "promote": 47384, "mccaf": 47385, "woodwork": 47386, "wawx": 47387, "ப": 47388, "ttino": 47389, "shari": 47390, "semper": 47391, "bettertogether": 47392, "ðŁijĬðŁı»": 47393, "zebra": 47394, "pondering": 47395, "enchil": 47396, "hom": 47397, "cosmic": 47398, "tanz": 47399, "mocked": 47400, "eccc": 47401, "athed": 47402, "abolish": 47403, "propeller": 47404, "parisagreement": 47405, "assemblies": 47406, "industry": 47407, "fraudulent": 47408, "pesa": 47409, "changmin": 47410, "axx": 47411, "ðŁĴµ": 47412, "irrational": 47413, "cusa": 47414, "ramadhan": 47415, "octavia": 47416, "onelove": 47417, "jacki": 47418, "barak": 47419, "taxider": 47420, "serious": 47421, "nathanfillion": 47422, "mcen": 47423, "chk": 47424, "popart": 47425, "gravity": 47426, "coppola": 47427, "readingfc": 47428, "illusions": 47429, "jig": 47430, "wwx": 47431, "resh": 47432, "exporting": 47433, "buzzard": 47434, "âĻ¤": 47435, "pcm": 47436, "lanapar": 47437, "kos": 47438, "aromas": 47439, "antalya": 47440, "wwdc": 47441, "vena": 47442, "phila": 47443, "ballin": 47444, "ðŁijĦ": 47445, "quinta": 47446, "mao": 47447, "fery": 47448, "eighty": 47449, "sentiments": 47450, "safeguarding": 47451, "rwa": 47452, "puffs": 47453, 
"lucille": 47454, "decath": 47455, "slu": 47456, "nugent": 47457, "deter": 47458, "brazil": 47459, "zeiss": 47460, "superbowl": 47461, "subsidy": 47462, "altern": 47463, "hidalgo": 47464, "enzymes": 47465, "ä½": 47466, "tagne": 47467, "hairdresser": 47468, "adrien": 47469, "walkout": 47470, "opposes": 47471, "cantina": 47472, "bedside": 47473, "afan": 47474, "ðŁĶĹ": 47475, "prophetic": 47476, "danes": 47477, "unsuccessful": 47478, "supercharged": 47479, "pkk": 47480, "exemption": 47481, "hartle": 47482, "secular": 47483, "clipping": 47484, "brs": 47485, "unitedway": 47486, "cnet": 47487, "patchy": 47488, "hagan": 47489, "een": 47490, "âļľ": 47491, "vara": 47492, "sympathi": 47493, "nevertrump": 47494, "affirmation": 47495, "omf": 47496, "nycfc": 47497, "maja": 47498, "surro": 47499, "keerth": 47500, "upscale": 47501, "sandalwood": 47502, "monarchy": 47503, "knobs": 47504, "åĭ": 47505, "potholes": 47506, "hungergames": 47507, "terraces": 47508, "nasir": 47509, "counsell": 47510, "welcometo": 47511, "waq": 47512, "seaman": 47513, "mita": 47514, "stunningly": 47515, "ontheroad": 47516, "inability": 47517, ")!!": 47518, "bongo": 47519, "antv": 47520, "sput": 47521, "worldenvironmentday": 47522, "resusc": 47523, "ytd": 47524, "fim": 47525, "eunhyuk": 47526, "sachin": 47527, "roseanne": 47528, "clermont": 47529, "apec": 47530, "amina": 47531, "vening": 47532, "nantes": 47533, "almost": 47534, "sinus": 47535, "exas": 47536, "tyl": 47537, "tien": 47538, "plead": 47539, "lancs": 47540, "burnaby": 47541, "rek": 47542, "joom": 47543, "observers": 47544, "discography": 47545, "clg": 47546, "âĻ¦": 47547, "snack": 47548, "rti": 47549, "oily": 47550, "crystalli": 47551, "brute": 47552, "webdevelopment": 47553, "toppings": 47554, "laf": 47555, "anis": 47556, "adder": 47557, "reliving": 47558, "carlin": 47559, "battleof": 47560, "weg": 47561, "syrian": 47562, "pont": 47563, "ndc": 47564, "laghate": 47565, "yuma": 47566, "spp": 47567, "piti": 47568, "robbing": 47569, "marting": 47570, "reykja": 47571, "rajput": 47572, "ncds": 47573, "kiewicz": 47574, "âĢ¢âĢ¢": 47575, "vampire": 47576, "substantially": 47577, "opioids": 47578, "nepali": 47579, "kline": 47580, "aroo": 47581, "understand": 47582, "litt": 47583, "uit": 47584, "thrombo": 47585, "saries": 47586, "quot": 47587, "balling": 47588, "ttr": 47589, "sgh": 47590, "philipp": 47591, "brant": 47592, "acl": 47593, "mello": 47594, "whittaker": 47595, ".;": 47596, "defiant": 47597, "bgc": 47598, "replying": 47599, "mirren": 47600, "metamorpho": 47601, "schwab": 47602, "bulge": 47603, "utilized": 47604, "pickering": 47605, "pardon": 47606, "dsa": 47607, "à¸Ī": 47608, "dooley": 47609, "cumulative": 47610, "л": 47611, "urgency": 47612, "emir": 47613, "+/-": 47614, "¦Ī": 47615, "otas": 47616, "âı³": 47617, "stationed": 47618, "grapevine": 47619, "arac": 47620, "karanjohar": 47621, "fancy": 47622, "saul": 47623, "coogs": 47624, "lgbtq": 47625, "اÙħ": 47626, "javi": 47627, "ummer": 47628, "pll": 47629, "denis": 47630, "daipur": 47631, "puffin": 47632, "lewisham": 47633, "fandom": 47634, "cope": 47635, "vesmatter": 47636, "sve": 47637, "helpless": 47638, "deodor": 47639, "ostrich": 47640, "kazan": 47641, "fridaythe": 47642, "condor": 47643, "vx": 47644, "sophomores": 47645, "robles": 47646, "cutt": 47647, "climbers": 47648, "리": 47649, "sleg": 47650, "snf": 47651, "macys": 47652, "hydrating": 47653, "groupe": 47654, "poyn": 47655, "moulin": 47656, "hgtv": 47657, "lmfaooo": 47658, "sulphur": 47659, "asdfghjkl": 47660, "annabelle": 47661, "humpback": 47662, "braved": 
47663, "viswasam": 47664, "multipurpose": 47665, "humidi": 47666, "escorted": 47667, "barbican": 47668, "fad": 47669, "corsa": 47670, "ðŁ¤«": 47671, "pippa": 47672, "hereto": 47673, "cany": 47674, "sergi": 47675, "orcas": 47676, "ovie": 47677, "edou": 47678, "sany": 47679, "globalization": 47680, "mancini": 47681, "foodtruck": 47682, "fis": 47683, "defibrill": 47684, "schre": 47685, "smafia": 47686, "lovewins": 47687, "laut": 47688, "kaka": 47689, "hollande": 47690, "gameon": 47691, "resurgence": 47692, "outside": 47693, "olympiad": 47694, "intan": 47695, "abstraction": 47696, "rapid": 47697, "palom": 47698, "calle": 47699, "jasmin": 47700, "attackers": 47701, "swagg": 47702, "mitra": 47703, "kylo": 47704, "ல": 47705, "hermitage": 47706, "gordo": 47707, "eira": 47708, "sosfam": 47709, "rollout": 47710, "excite": 47711, "synod": 47712, "merrill": 47713, "cals": 47714, "assa": 47715, "livelihoods": 47716, "juve": 47717, "theblack": 47718, "gopackgo": 47719, "antlers": 47720, "albanian": 47721, "woolly": 47722, "quiche": 47723, "purification": 47724, "areth": 47725, "smarthome": 47726, "nek": 47727, "allblacks": 47728, "mexicans": 47729, "ism": 47730, "germs": 47731, "complexion": 47732, "marck": 47733, "ushi": 47734, "ðŁIJIJ": 47735, "charl": 47736, "castic": 47737, "tillerson": 47738, "giuliani": 47739, "biodegradable": 47740, "malbec": 47741, "bois": 47742, "jubil": 47743, "imes": 47744, "rame": 47745, "genetic": 47746, "espnu": 47747, "chley": 47748, "soho": 47749, "gopher": 47750, "gsc": 47751, "buuren": 47752, "cube": 47753, "bridesmaids": 47754, "webinars": 47755, "toe": 47756, "manipur": 47757, "violently": 47758, "noticias": 47759, "exchanging": 47760, "chiev": 47761, "replaceable": 47762, "muaythai": 47763, "buss": 47764, "spil": 47765, "instalment": 47766, "divya": 47767, "caitlin": 47768, "olim": 47769, "filtering": 47770, "whirlwind": 47771, "stared": 47772, "priorit": 47773, "pram": 47774, "pompeii": 47775, "monologue": 47776, "kite": 47777, "buka": 47778, "âĢ¦..": 47779, "vaccine": 47780, "brero": 47781, "wozni": 47782, "solent": 47783, "referr": 47784, "myrt": 47785, "gridiron": 47786, "galatasaray": 47787, "froze": 47788, "claremont": 47789, "ðŁ¥ĥ": 47790, "victorias": 47791, "sseldorf": 47792, "pastures": 47793, "netneutrality": 47794, "chor": 47795, "ðŁijģ": 47796, "ಿ": 47797, "weho": 47798, "symptom": 47799, "josel": 47800, "inous": 47801, "dragoncon": 47802, "powerball": 47803, "pte": 47804, "fourthofjuly": 47805, "ecla": 47806, "earbuds": 47807, "whereabouts": 47808, "saltlife": 47809, "deprivation": 47810, "chter": 47811, "wiggle": 47812, "system": 47813, "psst": 47814, "chaz": 47815, "dany": 47816, "rimo": 47817, "oaxaca": 47818, "lanaparrilla": 47819, "barcelon": 47820, "melancholy": 47821, "wayback": 47822, "hotro": 47823, "nsi": 47824, "lilly": 47825, "kuro": 47826, "jahan": 47827, "intellect": 47828, "boardgame": 47829, "ðŁıĬ": 47830, "sneakpeek": 47831, "kprc": 47832, "jails": 47833, "candel": 47834, "zanzi": 47835, "mortimer": 47836, "starch": 47837, "rags": 47838, "pfa": 47839, "longlive": 47840, "kart": 47841, "girona": 47842, "crocker": 47843, "christoph": 47844, "precautions": 47845, "warship": 47846, "perm": 47847, "parent": 47848, "vangogh": 47849, "gifford": 47850, "allegheny": 47851, "rayn": 47852, "utm": 47853, "stencil": 47854, "recalling": 47855, "penney": 47856, "zazzle": 47857, "ìĥĿ": 47858, "hinds": 47859, "arenas": 47860, "nuev": 47861, "lawler": 47862, "guin": 47863, "dothis": 47864, "ðŁijķ": 47865, "ì¶ķíķĺ": 47866, "weg": 47867, "tib": 47868, 
"ridin": 47869, "complexes": 47870, "turbulent": 47871, "pesos": 47872, "demarcus": 47873, "vallarta": 47874, "samsun": 47875, "kisses": 47876, "heinrich": 47877, "deportes": 47878, "wilms": 47879, "urd": 47880, "thenext": 47881, "inkigayo": 47882, "howi": 47883, "firsts": 47884, "carriage": 47885, "cleanliness": 47886, "maswar": 47887, "isch": 47888, "axel": 47889, "sizzle": 47890, "roadhouse": 47891, "frans": 47892, "entourage": 47893, "cobble": 47894, "booth": 47895, "benedict": 47896, "talon": 47897, "fcu": 47898, "yearofthe": 47899, "rayon": 47900, "raidernation": 47901, "foyle": 47902, "koval": 47903, "pianos": 47904, "lpg": 47905, "burmese": 47906, "manure": 47907, "geocaching": 47908, "coscino": 47909, "bnp": 47910, "ferra": 47911, "strophy": 47912, "marais": 47913, "cees": 47914, "legendof": 47915, "katniss": 47916, "enoch": 47917, "aved": 47918, "youknow": 47919, "dprk": 47920, "ðŁĺ¢ðŁĺ¢": 47921, "spun": 47922, "prost": 47923, "sorrows": 47924, "centred": 47925, "kea": 47926, "galicia": 47927, "?ðŁ¤Ķ": 47928, "ÑĢода": 47929, "bouchard": 47930, "ðŁĴĻðŁĴľ": 47931, "yui": 47932, "seedlings": 47933, "jonah": 47934, "recovers": 47935, "nyrd": 47936, "boardroom": 47937, "suma": 47938, "myjaps": 47939, "tung": 47940, "shai": 47941, "irgc": 47942, "elio": 47943, "wagons": 47944, "kashi": 47945, "policemen": 47946, "johnnie": 47947, "alecoscino": 47948, "shopify": 47949, "dotted": 47950, "detri": 47951, "vaw": 47952, "tofficial": 47953, "inyour": 47954, "chalmers": 47955, "traced": 47956, "novi": 47957, "byes": 47958, "ariel": 47959, "nippon": 47960, "lapel": 47961, "griez": 47962, "bgs": 47963, "fooling": 47964, "dita": 47965, "vijaysethu": 47966, "nmwx": 47967, "asot": 47968, "kranti": 47969, "helm": 47970, "vedi": 47971, "sickest": 47972, "mochi": 47973, "kabo": 47974, "shrubs": 47975, "hered": 47976, "bsp": 47977, "sqm": 47978, "hamr": 47979, "dulkar": 47980, "antha": 47981, "nrf": 47982, "avoidance": 47983, "aten": 47984, "publix": 47985, "bearers": 47986, "nasi": 47987, "hap": 47988, "hells": 47989, "ðŁĸ¥": 47990, "ื": 47991, "thelastjedi": 47992, "ohwx": 47993, "ðŁį«": 47994, "wahoo": 47995, "therese": 47996, "recaps": 47997, "ssnhq": 47998, "birdphotography": 47999, "vay": 48000, "petti": 48001, "paulo": 48002, "belvedere": 48003, "(*": 48004, "grl": 48005, "duvet": 48006, "cpec": 48007, "sait": 48008, "porsch": 48009, "measurable": 48010, "aviators": 48011, "fremantle": 48012, "breen": 48013, "onom": 48014, "meand": 48015, "lifesaving": 48016, "euref": 48017, "endon": 48018, "embaras": 48019, "airasia": 48020, "elis": 48021, "dunkin": 48022, "starmagic": 48023, "sill": 48024, "portobello": 48025, "kiefer": 48026, "exe": 48027, "muted": 48028, "ãģ¦": 48029, "wethepeople": 48030, "logia": 48031, "liberal": 48032, "theforceawakens": 48033, "mined": 48034, "haunts": 48035, "freckles": 48036, "caretaker": 48037, "sindia": 48038, "âķIJ": 48039, "devlin": 48040, "liston": 48041, "directioner": 48042, "ohn": 48043, "figaro": 48044, "emmanuel": 48045, "dubois": 48046, "clones": 48047, "bruise": 48048, "ðŁİĪðŁİī": 48049, "disinfe": 48050, "dermatology": 48051, "asr": 48052, "swatch": 48053, "discomfort": 48054, "tamanna": 48055, "piday": 48056, "macken": 48057, "katic": 48058, "delusional": 48059, "shawnee": 48060, "gud": 48061, "albino": 48062, "pali": 48063, "dingh": 48064, "cucumbers": 48065, "coffey": 48066, "anticipating": 48067, "treasured": 48068, "websummit": 48069, "sheltered": 48070, "savor": 48071, "pedagogy": 48072, "mgs": 48073, "shma": 48074, "sbu": 48075, "denali": 48076, 
"campos": 48077, "bubblegum": 48078, "oir": 48079, "leaps": 48080, "yler": 48081, "rone": 48082, "sanskrit": 48083, "mint": 48084, "meatless": 48085, "futurist": 48086, "dude": 48087, "avel": 48088, "protested": 48089, "squire": 48090, "zaki": 48091, "szn": 48092, "harcourt": 48093, "cyclone": 48094, "bourdain": 48095, "gatherings": 48096, "dant": 48097, "adventurer": 48098, "paragon": 48099, "altman": 48100, "dding": 48101, "banerjee": 48102, "snorkeling": 48103, "motherwell": 48104, "missy": 48105, "ender": 48106, "glows": 48107, "kiwis": 48108, "chickpea": 48109, "poro": 48110, "efron": 48111, "appt": 48112, "uy": 48113, "specified": 48114, "gabby": 48115, "estrada": 48116, "combos": 48117, "bourbon": 48118, "vini": 48119, "varun": 48120, "stephani": 48121, "keywords": 48122, "carvings": 48123, "amitabh": 48124, "wrought": 48125, "twal": 48126, "reels": 48127, "clubbing": 48128, "ubiquit": 48129, "crit": 48130, "ambedkar": 48131, "æĻ": 48132, "pruning": 48133, "vaccinated": 48134, "boeing": 48135, "sks": 48136, "loona": 48137, "hypnosis": 48138, "edelman": 48139, "phol": 48140, "hew": 48141, "colosse": 48142, "mckinsey": 48143, "uon": 48144, "tote": 48145, "sacrificing": 48146, "oxi": 48147, "nang": 48148, "emu": 48149, "пÑĢиÑĢода": 48150, "mth": 48151, "kerswednesday": 48152, "argued": 48153, "timelapse": 48154, "risking": 48155, "regulating": 48156, "nigh": 48157, "likelihood": 48158, "cubic": 48159, "auction": 48160, "reinfor": 48161, "pistor": 48162, "noses": 48163, "yel": 48164, "snuggles": 48165, "pei": 48166, "jeanette": 48167, "taku": 48168, "rith": 48169, "guyz": 48170, "à¸ŀ": 48171, "yte": 48172, "verted": 48173, "paysoff": 48174, "jauregui": 48175, "hooligans": 48176, "procedural": 48177, "mib": 48178, "hardy": 48179, "eleng": 48180, "checkers": 48181, "alline": 48182, "themet": 48183, "proudof": 48184, "keerthyofficial": 48185, "collaborator": 48186, "niu": 48187, "inflicted": 48188, "advani": 48189, "retwee": 48190, "memoriam": 48191, "ficial": 48192, "tighter": 48193, "salem": 48194, "reviewers": 48195, "brics": 48196, "bendigo": 48197, "amell": 48198, "turkish": 48199, "sushmaswar": 48200, "paulson": 48201, "palawan": 48202, "mollie": 48203, "stitcher": 48204, "sburgh": 48205, "iru": 48206, "haydn": 48207, "eners": 48208, "aroa": 48209, "uzzi": 48210, "sarajevo": 48211, "hela": 48212, "apollo": 48213, "ninety": 48214, "vaca": 48215, "spon": 48216, "ventu": 48217, "jelena": 48218, "heifer": 48219, "avoids": 48220, "spine": 48221, "prize": 48222, "marist": 48223, "recreating": 48224, "mede": 48225, "wooden": 48226, "findlay": 48227, "rofl": 48228, "ndi": 48229, "comprehend": 48230, "yugo": 48231, "yü": 48232, "towork": 48233, "ufos": 48234, "sonar": 48235, "piston": 48236, "recording": 48237, "tentative": 48238, "artforsale": 48239, "pellets": 48240, "fredo": 48241, "ÙĪر": 48242, "muses": 48243, "customization": 48244, "profound": 48245, "isner": 48246, "ideally": 48247, "siam": 48248, "plankton": 48249, "cmdr": 48250, "manger": 48251, "franken": 48252, "customizable": 48253, "म": 48254, "walkaway": 48255, "swivel": 48256, "vastly": 48257, "noton": 48258, "lexa": 48259, "exmoor": 48260, "zas": 48261, "tante": 48262, "reductions": 48263, "lolly": 48264, "hipsters": 48265, "benefited": 48266, "ë²": 48267, "wwwww": 48268, "masculine": 48269, "fiji": 48270, "drey": 48271, "phill": 48272, "aneous": 48273, "nicol": 48274, "mendez": 48275, "disappro": 48276, "chner": 48277, "throughs": 48278, "shenmue": 48279, "eastman": 48280, "ðŁIJİ": 48281, "yuck": 48282, "undertale": 48283, 
"reys": 48284, "gobeavs": 48285, "engen": 48286, "cna": 48287, "merr": 48288, "birk": 48289, "ãģ¨ç¹ĭãģĮãĤĬãģŁãģĦ": 48290, "âĥ£@": 48291, "ynna": 48292, "steed": 48293, "offender": 48294, "atum": 48295, "vanishing": 48296, "presidenti": 48297, "lovethem": 48298, "gnocchi": 48299, "friggin": 48300, "peril": 48301, "madhya": 48302, "agne": 48303, "deejay": 48304, "marnock": 48305, "mtb": 48306, "foldable": 48307, "@___": 48308, "standre": 48309, "bronx": 48310, "bowski": 48311, "finite": 48312, "crockett": 48313, "bsf": 48314, "getit": 48315, "serenawilliams": 48316, "miro": 48317, "ignatius": 48318, "slay": 48319, "rinse": 48320, "fondue": 48321, "seldom": 48322, "smore": 48323, "gani": 48324, "dyce": 48325, "dmitry": 48326, "crumb": 48327, "latepost": 48328, "primark": 48329, "ohana": 48330, "florals": 48331, "doa": 48332, "remembranceday": 48333, "dds": 48334, "azione": 48335, "toonami": 48336, "airport": 48337, "æĿ±": 48338, "thad": 48339, "fist": 48340, "dinesh": 48341, "drwho": 48342, "adwords": 48343, "admirer": 48344, "proje": 48345, "kyrgyz": 48346, "à«": 48347, "manifestation": 48348, "lewan": 48349, "jic": 48350, "thibau": 48351, "leased": 48352, "vanity": 48353, "nourished": 48354, "nevertheless": 48355, "augmente": 48356, "fuelled": 48357, "chead": 48358, "wilshere": 48359, "rudi": 48360, "pz": 48361, "myco": 48362, "morro": 48363, "herbalife": 48364, "hardrock": 48365, "deman": 48366, "dreality": 48367, "spades": 48368, "cevic": 48369, "bhai": 48370, "baron": 48371, "ultimatefan": 48372, "hounews": 48373, "tobi": 48374, "strut": 48375, "keel": 48376, "affiliation": 48377, "themasters": 48378, "smal": 48379, "hue": 48380, "esteban": 48381, "conv": 48382, "omnic": 48383, "databases": 48384, "cov": 48385, "terti": 48386, "stg": 48387, "snoopdogg": 48388, "metabol": 48389, "lethbridge": 48390, "ðŁı»âĢįâĻĢï¸ı": 48391, "yearling": 48392, "residentevil": 48393, "nwsl": 48394, "iyaki": 48395, "griezmann": 48396, "cous": 48397, "ðŁĵĿ:": 48398, "torian": 48399, "sami": 48400, "ðŁĶ¥ðŁĶ¥ðŁĶ¥ðŁĶ¥ðŁĶ¥": 48401, "gare": 48402, "alliances": 48403, "whitfield": 48404, "wether": 48405, "refining": 48406, "coyi": 48407, "kraken": 48408, "ðŁĺĺâĿ¤": 48409, "singularity": 48410, "lili": 48411, "hns": 48412, "boldand": 48413, "wawrinka": 48414, "misogyny": 48415, "lovers": 48416, "cq": 48417, "bdg": 48418, "adona": 48419, "garter": 48420, "womenof": 48421, "scd": 48422, "recognising": 48423, "muna": 48424, "strou": 48425, "signalling": 48426, "laredo": 48427, "hellboy": 48428, "aleksand": 48429, "unavailable": 48430, "pediatric": 48431, "asin": 48432, "meria": 48433, "rishi": 48434, "futurism": 48435, "wye": 48436, "polarized": 48437, "ewe": 48438, "propel": 48439, "informs": 48440, "crease": 48441, "~\"": 48442, "artiston": 48443, "likefor": 48444, "heidelberg": 48445, "erra": 48446, "lifein": 48447, "lenny": 48448, "interrupt": 48449, "coherent": 48450, "caz": 48451, "vickers": 48452, "leveled": 48453, "fbs": 48454, "cabins": 48455, "bummed": 48456, "apostles": 48457, "weh": 48458, "tendon": 48459, "souvenirs": 48460, "infuri": 48461, "pierce": 48462, "asset": 48463, "mlas": 48464, "goth": 48465, "diggin": 48466, "annas": 48467, "ylor": 48468, "thwaite": 48469, "swel": 48470, "panera": 48471, "murderers": 48472, "crooked": 48473, "bsgo": 48474, "acu": 48475, "aon": 48476, "rean": 48477, "oneof": 48478, "kohl": 48479, "bloodh": 48480, "pesticide": 48481, "lostdog": 48482, "flexing": 48483, "ëĤĺ": 48484, "supra": 48485, "eternally": 48486, "ðŁļĻ": 48487, "paolo": 48488, "olan": 48489, "momo": 48490, 
"iselle": 48491, "captainmarvel": 48492, "slou": 48493, "mistakenly": 48494, "akhilesh": 48495, "mert": 48496, "ilinan": 48497, "buon": 48498, "balkan": 48499, "mirro": 48500, "millen": 48501, "derail": 48502, "damon": 48503, "titi": 48504, "bios": 48505, "redon": 48506, "picard": 48507, "parte": 48508, "ðŁ¤Ł": 48509, "غ": 48510, "sonics": 48511, "firsth": 48512, "ddc": 48513, "vegans": 48514, "turban": 48515, "nigan": 48516, "lottie": 48517, "lyndon": 48518, "starbuck": 48519, "pinkfloyd": 48520, "lifestyles": 48521, "amara": 48522, "ashe": 48523, "rsc": 48524, "vala": 48525, "smer": 48526, "cwgc": 48527, "client": 48528, "buenas": 48529, "jagan": 48530, "coops": 48531, "ðŁijijðŁijij": 48532, "specializes": 48533, "snagged": 48534, "glar": 48535, "bennet": 48536, "wildlifewednesday": 48537, "bowden": 48538, "pik": 48539, "artin": 48540, "emporium": 48541, "arl": 48542, "reba": 48543, "passer": 48544, "disappoints": 48545, "additive": 48546, "âľĬðŁı½": 48547, "bayer": 48548, "missoula": 48549, "haskell": 48550, "commences": 48551, "nix": 48552, "neman": 48553, "exploited": 48554, "plasticsurgery": 48555, "ccd": 48556, "asocial": 48557, "vot": 48558, "siegel": 48559, "froome": 48560, "kapam": 48561, "fara": 48562, "eha": 48563, "probes": 48564, "mwf": 48565, "meeting": 48566, "pbb": 48567, "akins": 48568, "mistletoe": 48569, "kingdomhearts": 48570, "forkids": 48571, "ecr": 48572, "bale": 48573, "escorts": 48574, "adidasoriginals": 48575, "kwa": 48576, "kts": 48577, "halloffame": 48578, "ðŁĺį.": 48579, "wags": 48580, "potted": 48581, "owing": 48582, "honeycomb": 48583, "hefty": 48584, "urology": 48585, "merle": 48586, "bpd": 48587, "stripping": 48588, "reich": 48589, "kstate": 48590, "guay": 48591, "yonge": 48592, "shakti": 48593, "gloom": 48594, "batt": 48595, "sonom": 48596, "nery": 48597, "elba": 48598, "blanks": 48599, "helle": 48600, "triplets": 48601, "bombay": 48602, "akarta": 48603, "abia": 48604, "transmitted": 48605, "rolf": 48606, "jais": 48607, "angularjs": 48608, "fierc": 48609, "mss": 48610, "trace": 48611, "à¥ĩ": 48612, "tombs": 48613, "oldman": 48614, "kombucha": 48615, "fol": 48616, "ehealth": 48617, "cereals": 48618, "arelli": 48619, "inari": 48620, "ðŁĴ©": 48621, "wol": 48622, "liberties": 48623, "fawn": 48624, "affirm": 48625, "nunavut": 48626, "hysterical": 48627, "kdrama": 48628, "artes": 48629, "âĢ¢âĢ¢âĢ¢âĢ¢âĢ¢âĢ¢âĢ¢âĢ¢": 48630, "valentin": 48631, "manslaughter": 48632, "gales": 48633, "eoin": 48634, "energized": 48635, "dels": 48636, "withdraws": 48637, "stles": 48638, "sarcastic": 48639, "ramesh": 48640, "incredibles": 48641, "lockhart": 48642, "yawn": 48643, "ultimatefanlive": 48644, "oooooooooooooooo": 48645, "muen": 48646, "gurudev": 48647, "teer": 48648, "peeling": 48649, "newsnow": 48650, "linguistics": 48651, "directv": 48652, "agend": 48653, "unilever": 48654, "ruger": 48655, "handedly": 48656, "erose": 48657, "limel": 48658, "thec": 48659, "royalties": 48660, "finishers": 48661, "nrg": 48662, "mgt": 48663, "fidget": 48664, "comps": 48665, "bacon": 48666, "aggressively": 48667, "abit": 48668, "châ": 48669, "tarde": 48670, "slugger": 48671, "qanda": 48672, "greening": 48673, "dats": 48674, "enslaved": 48675, "spector": 48676, "oye": 48677, "freef": 48678, "bhand": 48679, "stopbrexit": 48680, "misconceptions": 48681, "cava": 48682, "ðŁĺįðŁĺįðŁĺįðŁĺįðŁĺįðŁĺįðŁĺįðŁĺį": 48683, "multitasking": 48684, "housel": 48685, "ferreira": 48686, "centime": 48687, "ankles": 48688, "jodh": 48689, "helly": 48690, "frome": 48691, "outtuesday": 48692, "narnia": 48693, "balaji": 
48694, "lbloggers": 48695, "jyoti": 48696, "ðŁįĩ": 48697, "lancia": 48698, "capri": 48699, "yap": 48700, "natash": 48701, "downfall": 48702, ".\"âĢĶ": 48703, "î": 48704, "ligament": 48705, "coatings": 48706, "aided": 48707, "hiko": 48708, "falling": 48709, "encrypted": 48710, "yegfood": 48711, "infringement": 48712, "cudi": 48713, "cep": 48714, "ðŁĺįðŁĺĤ": 48715, "trad": 48716, "superrugby": 48717, "edwin": 48718, "whiche": 48719, "vimeo": 48720, "layne": 48721, "invigor": 48722, "hehe": 48723, "dubrovnik": 48724, "bieber": 48725, "utr": 48726, "shaman": 48727, "opers": 48728, "hamill": 48729, "enig": 48730, "dif": 48731, "arum": 48732, "scrapbook": 48733, "minh": 48734, "divergence": 48735, "mckinnon": 48736, "lifetime": 48737, "guterres": 48738, "wille": 48739, "pleas": 48740, "patty": 48741, "micron": 48742, "kz": 48743, "domaine": 48744, "rusher": 48745, "mds": 48746, "chesney": 48747, "screwdriver": 48748, "âģ©,": 48749, "sledge": 48750, "hauer": 48751, "chana": 48752, "stamina": 48753, "sprinkler": 48754, "pln": 48755, "heff": 48756, "bolton": 48757, "omon": 48758, "carrington": 48759, "accordion": 48760, "jorge": 48761, "interception": 48762, "inputs": 48763, "gull": 48764, "transcription": 48765, "vanuatu": 48766, "itical": 48767, "ethos": 48768, "tich": 48769, "spacey": 48770, "peeking": 48771, "umi": 48772, "hager": 48773, "psychotic": 48774, "illian": 48775, "illia": 48776, "bonnaroo": 48777, "anese": 48778, "puc": 48779, "laghateparth": 48780, "enhall": 48781, "economical": 48782, "dredge": 48783, "%-": 48784, "uwe": 48785, "tubular": 48786, "scouncil": 48787, "peasants": 48788, "fler": 48789, "tumbler": 48790, "hep": 48791, "fordham": 48792, "rowley": 48793, "initials": 48794, "evasion": 48795, "ernation": 48796, "plugins": 48797, "cochran": 48798, "cattle": 48799, "acidity": 48800, "ðŁİĬðŁİī": 48801, "regrann": 48802, "jumpman": 48803, "eface": 48804, "xma": 48805, "patriarchy": 48806, "escobar": 48807, "cristian": 48808, "tipton": 48809, "nueva": 48810, "hackney": 48811, "backseat": 48812, "killarney": 48813, "aidan": 48814, "stadion": 48815, "simultaneous": 48816, "idaho": 48817, "aje": 48818, "uth": 48819, "figure": 48820, "clos": 48821, "burk": 48822, "voluntar": 48823, "recite": 48824, "macfarlane": 48825, "curfew": 48826, "boudo": 48827, "wgn": 48828, "stix": 48829, "slap": 48830, "scratched": 48831, "phillip": 48832, "journe": 48833, "expelled": 48834, "waz": 48835, "uke": 48836, "tatiana": 48837, "oue": 48838, "hopp": 48839, "dimitri": 48840, "ðŁĵ£": 48841, "matologist": 48842, "electrifying": 48843, "bluffs": 48844, "billsmafia": 48845, "azcardinals": 48846, "yaa": 48847, "xmas": 48848, "shara": 48849, "rith": 48850, "gills": 48851, "dres": 48852, "barton": 48853, "authorization": 48854, "imperialism": 48855, "homeof": 48856, "todo": 48857, "footpath": 48858, "bandwidth": 48859, "visitspain": 48860, "mohsin": 48861, "erupted": 48862, "miki": 48863, "insignia": 48864, "mikel": 48865, "ssh": 48866, "gera": 48867, "bankholiday": 48868, "awan": 48869, "tweak": 48870, "starcraft": 48871, "eal": 48872, "construction": 48873, "skeletons": 48874, "leep": 48875, "inem": 48876, "barclay": 48877, "shipwreck": 48878, "monsieur": 48879, "yoh": 48880, "ront": 48881, "formative": 48882, "sero": 48883, "lep": 48884, "horseman": 48885, "hoosier": 48886, "hazmat": 48887, "cylinders": 48888, "centi": 48889, "ðŁĴ¥ðŁĴ¥ðŁĴ¥": 48890, "reem": 48891, "naire": 48892, "musically": 48893, "grasshopper": 48894, "estonian": 48895, "terminology": 48896, "romain": 48897, "bloggerrt": 48898, 
"toxin": 48899, "stance": 48900, "cultivated": 48901, "anast": 48902, "ðŁIJį": 48903, "shimano": 48904, "gopher": 48905, "enei": 48906, "recyclable": 48907, "gamification": 48908, "fightfor": 48909, "cq": 48910, "avocados": 48911, "keys": 48912, "elike": 48913, "glycer": 48914, "shakur": 48915, "mobilization": 48916, "galley": 48917, "explain": 48918, "exchanged": 48919, "peth": 48920, "obedience": 48921, "illage": 48922, "ennis": 48923, "ãĥŀ": 48924, "wiv": 48925, "wallabies": 48926, "maar": 48927, "igers": 48928, "fintech": 48929, "finalized": 48930, "woj": 48931, "meaningless": 48932, "infield": 48933, "onnaise": 48934, "eet": 48935, "bronte": 48936, "passages": 48937, "ðŁij§": 48938, "strickland": 48939, "northernlights": 48940, "lomond": 48941, "htc": 48942, "wray": 48943, "shifter": 48944, "dialog": 48945, "ðŁįį": 48946, ">>>>>>": 48947, "teatime": 48948, "stech": 48949, "sichuan": 48950, "quill": 48951, "franca": 48952, "complementary": 48953, "barrington": 48954, "marcus": 48955, "malam": 48956, "goooo": 48957, "forsa": 48958, "electra": 48959, "afs": 48960, "âĹĨ": 48961, "trife": 48962, "snazzy": 48963, "folia": 48964, "andolan": 48965, "afterdark": 48966, "woodson": 48967, "strade": 48968, "littlest": 48969, "ogun": 48970, "conwy": 48971, "cowards": 48972, "ðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤðŁĺĤ": 48973, "íĬ¸": 48974, "seul": 48975, "murphy": 48976, "dunks": 48977, "kapilshar": 48978, "joachim": 48979, "womack": 48980, "equality": 48981, "averages": 48982, "aine": 48983, "ðŁ¦Ī": 48984, "tacular": 48985, "disability": 48986, "uked": 48987, "midcentury": 48988, "barthol": 48989, "teasers": 48990, "tabern": 48991, "njcaa": 48992, "spout": 48993, "opi": 48994, "kubball": 48995, "blom": 48996, "soar": 48997, "populism": 48998, "methyl": 48999, "ðŁijĬðŁı¼": 49000, "ospre": 49001, "aloils": 49002, "ðŁĵĸ": 49003, "ðŁĮļ": 49004, "xer": 49005, "spilling": 49006, "publica": 49007, "cardam": 49008, "adish": 49009, "sacha": 49010, "pkg": 49011, "buda": 49012, "lyricist": 49013, "ibc": 49014, "grump": 49015, "hover": 49016, "halep": 49017, "antibody": 49018, "anemone": 49019, "âĻ¥âĻ¥âĻ¥âĻ¥": 49020, "mcl": 49021, "lithograph": 49022, "ccu": 49023, "sfest": 49024, "pathic": 49025, "callister": 49026, "ottawa": 49027, "gunsn": 49028, "rutger": 49029, "halibut": 49030, "envision": 49031, "differentiate": 49032, "ðŁļĢðŁļĢ": 49033, "piran": 49034, "latel": 49035, "ucn": 49036, "troubad": 49037, "raine": 49038, "fiercely": 49039, "learnenglish": 49040, "lease": 49041, "wexmondays": 49042, "emit": 49043, "drayton": 49044, "burrell": 49045, "scubadiving": 49046, "holler": 49047, "dru": 49048, "clocked": 49049, "wral": 49050, "apro": 49051, "translucent": 49052, "wbo": 49053, "patriarch": 49054, "moja": 49055, "lannister": 49056, "fishery": 49057, "nederland": 49058, "mildly": 49059, "mirai": 49060, "mako": 49061, "jap": 49062, "ðŁĺ©ðŁĺ©ðŁĺ©": 49063, "prostatec": 49064, "panna": 49065, "arama": 49066, "undertaking": 49067, "tompkins": 49068, "neop": 49069, "solids": 49070, "savoury": 49071, "eames": 49072, "cutlery": 49073, "woodbridge": 49074, "steamer": 49075, "rizzo": 49076, "wildcat": 49077, "ratna": 49078, "laminated": 49079, "kineni": 49080, "jalap": 49081, "aides": 49082, "acknowledges": 49083, "?!?!?!": 49084, "!ðŁİī": 49085, "wafc": 49086, "maggio": 49087, "haves": 49088, "darje": 49089, "ofi": 49090, "gril": 49091, "vasi": 49092, "brux": 49093, "mohd": 49094, "fakespeare": 49095, "arnold": 49096, "rmb": 49097, "forbe": 49098, "walleye": 49099, "rodi": 49100, "therapeutics": 49101, "strategi": 49102, 
"obste": 49103, "mudder": 49104, "downloadable": 49105, "ddings": 49106, "dca": 49107, "asiangames": 49108, "campeon": 49109, "appropriation": 49110, "thcentury": 49111, "ramatta": 49112, "draped": 49113, "bullion": 49114, "muc": 49115, "onex": 49116, "segreg": 49117, "ophelia": 49118, "bodily": 49119, "âĿ¤ðŁĺį": 49120, "wizar": 49121, "teased": 49122, "ademy": 49123, "toid": 49124, "sura": 49125, "lazarus": 49126, "snickers": 49127, "mase": 49128, "loh": 49129, "bowed": 49130, "biblio": 49131, "xchange": 49132, "harlan": 49133, "ghoshal": 49134, "flavorful": 49135, "bhagat": 49136, "allez": 49137, "whichever": 49138, "tenstein": 49139, "discer": 49140, "organiser": 49141, "mtg": 49142, "dreamliner": 49143, "tse": 49144, "hokkaido": 49145, "mok": 49146, "indulgent": 49147, "hickman": 49148, "blinded": 49149, "alyn": 49150, "aaaah": 49151, "spool": 49152, "loughborough": 49153, "interpret": 49154, "etv": 49155, "aristotle": 49156, "optimizing": 49157, "avicii": 49158, "madurai": 49159, "juli": 49160, "nawaz": 49161, "matchups": 49162, "abide": 49163, "painting": 49164, "welling": 49165, "veli": 49166, "octagon": 49167, "inscribed": 49168, "poking": 49169, "placer": 49170, "lifecycle": 49171, "kilig": 49172, "gsp": 49173, "elives": 49174, "clements": 49175, "nasheed": 49176, "mesut": 49177, "incarcerated": 49178, "distilled": 49179, "walang": 49180, "delicacy": 49181, "delgado": 49182, "chez": 49183, "chita": 49184, "adero": 49185, "tux": 49186, "patil": 49187, "odo": 49188, "abhcosmetics": 49189, "tvc": 49190, "pbc": 49191, "inaccurate": 49192, "hardworkpaysoff": 49193, "baller": 49194, "quotation": 49195, "merchandising": 49196, "gastri": 49197, "defenses": 49198, "drogba": 49199, "bexhill": 49200, "bankno": 49201, "winona": 49202, "sieg": 49203, "pgs": 49204, "hahahha": 49205, "aguchi": 49206, "subram": 49207, "miracle": 49208, "desch": 49209, "libre": 49210, "bacher": 49211, "entine": 49212, "bbcradi": 49213, "loudest": 49214, "rps": 49215, "pierc": 49216, "fryer": 49217, "stormtrooper": 49218, "rafaelnadal": 49219, "pasco": 49220, "exhaustion": 49221, "epiconetsy": 49222, "rctid": 49223, "kellie": 49224, "gaines": 49225, "dbz": 49226, "smriti": 49227, "sbridge": 49228, "limited": 49229, "claw": 49230, "technical": 49231, "biographical": 49232, "adored": 49233, "ะ": 49234, "exclude": 49235, "acadia": 49236, "keyboards": 49237, "furman": 49238, "soca": 49239, "suru": 49240, "nips": 49241, "swaps": 49242, "serverless": 49243, "rune": 49244, "puffy": 49245, "northampton": 49246, "nishings": 49247, "hender": 49248, "cartridges": 49249, "gunshot": 49250, "ðŁĵ¹": 49251, "filament": 49252, "respondents": 49253, "peyton": 49254, "mountaineer": 49255, "merging": 49256, "lifespan": 49257, "intimidation": 49258, "pafc": 49259, "nlwx": 49260, "expansive": 49261, "purr": 49262, "fck": 49263, "cae": 49264, "atti": 49265, "telethon": 49266, "sohn": 49267, "mendel": 49268, "lopes": 49269, "dori": 49270, "unbroken": 49271, "tered": 49272, "tastings": 49273, "inactive": 49274, "disintegr": 49275, "tassel": 49276, "sharethe": 49277, "piano": 49278, "islay": 49279, "airspace": 49280, "zawa": 49281, "ricciardo": 49282, "mington": 49283, "fresher": 49284, "curry": 49285, "revs": 49286, "pharoah": 49287, "hmv": 49288, "exhilarating": 49289, "whoo": 49290, "linkin": 49291, "krispy": 49292, "competency": 49293, "stewards": 49294, "nebu": 49295, "katsu": 49296, "admins": 49297, "bazar": 49298, "asar": 49299, "givingback": 49300, "ssummit": 49301, "songz": 49302, "linus": 49303, "rajkumar": 49304, "farmington": 
49305, "fantasia": 49306, "ðŁĺ´ðŁĺ´": 49307, "sobri": 49308, "lisse": 49309, "barrymore": 49310, "prism": 49311, "blob": 49312, "senew": 49313, "monoxide": 49314, "expire": 49315, "eighteen": 49316, "dipper": 49317, "xiao": 49318, "kilt": 49319, "hinch": 49320, "bbcsport": 49321, "bamboo": 49322, "pter": 49323, "exal": 49324, "ðŁ¦ĭ": 49325, "hamlin": 49326, "expeditions": 49327, "stargazing": 49328, "foodsecurity": 49329, "wylie": 49330, "ulf": 49331, "stingly": 49332, "onstorm": 49333, "loeb": 49334, "broome": 49335, "bnha": 49336, "pancreatic": 49337, "elive": 49338, "!!!!!!!!!!!": 49339, "therapper": 49340, "orthopedic": 49341, "avengersendgame": 49342, "antitrust": 49343, "ìļ°": 49344, "gote": 49345, "omd": 49346, "offside": 49347, "gyllen": 49348, "wineries": 49349, "whitewater": 49350, "adl": 49351, "lupita": 49352, "exceeds": 49353, "consisted": 49354, "chewbacca": 49355, "ashleigh": 49356, "nhljets": 49357, "issan": 49358, "shld": 49359, "hayat": 49360, "cranberries": 49361, "ðŁ¤ĺðŁı½": 49362, "rockthe": 49363, "springtraining": 49364, "fallout": 49365, "dairyfree": 49366, "waj": 49367, "undecided": 49368, "sown": 49369, "rcn": 49370, "northwales": 49371, "httr": 49372, "fumble": 49373, "dits": 49374, "compelled": 49375, "populist": 49376, "minted": 49377, "blanchett": 49378, ".''": 49379, "propulsion": 49380, "milla": 49381, "auberg": 49382, "hertz": 49383, "hta": 49384, "udaipur": 49385, "serendipity": 49386, "aztecs": 49387, "alsace": 49388, "ðŁIJij": 49389, "lun": 49390, "shoes": 49391, "charli": 49392, "garza": 49393, "ðŁĴŁ": 49394, "probiotics": 49395, "foxtv": 49396, "olis": 49397, "miff": 49398, "localized": 49399, "diffuser": 49400, "sigue": 49401, "funko": 49402, "rendous": 49403, "ðŁĴij": 49404, "jekyll": 49405, "<|startoftext|>": 49406, "<|endoftext|>": 49407} \ No newline at end of file diff --git a/stable-diffusion.cpp/rng.h b/stable-diffusion.cpp/rng.h new file mode 100644 index 0000000000000000000000000000000000000000..e8942605be75781d6cfde9fdc1e11720eef3f856 --- /dev/null +++ b/stable-diffusion.cpp/rng.h @@ -0,0 +1,35 @@ +#ifndef __RNG_H__ +#define __RNG_H__ + +#include +#include + +class RNG { + public: + virtual void manual_seed(uint64_t seed) = 0; + virtual std::vector randn(uint32_t n) = 0; +}; + +class STDDefaultRNG : public RNG { + private: + std::default_random_engine generator; + + public: + void manual_seed(uint64_t seed) { + generator.seed(seed); + } + + std::vector randn(uint32_t n) { + std::vector result; + float mean = 0.0; + float stddev = 1.0; + std::normal_distribution distribution(mean, stddev); + for (int i = 0; i < n; i++) { + float random_number = distribution(generator); + result.push_back(random_number); + } + return result; + } +}; + +#endif // __RNG_H__ \ No newline at end of file diff --git a/stable-diffusion.cpp/rng_philox.h b/stable-diffusion.cpp/rng_philox.h new file mode 100644 index 0000000000000000000000000000000000000000..c9b70fc261792edd5c1abf7d3411560e12ef8bfb --- /dev/null +++ b/stable-diffusion.cpp/rng_philox.h @@ -0,0 +1,125 @@ +#ifndef __RNG_PHILOX_H__ +#define __RNG_PHILOX_H__ + +#include +#include + +#include "rng.h" + +// RNG imitiating torch cuda randn on CPU. 
+// RNG imitating torch cuda randn on CPU.
+// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
+class PhiloxRNG : public RNG {
+  private:
+    uint64_t seed;
+    uint32_t offset;
+
+  private:
+    std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
+    std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
+    float two_pow32_inv = 2.3283064e-10;
+    float two_pow32_inv_2pi = 2.3283064e-10 * 6.2831855;
+
+    std::vector<uint32_t> uint32(uint64_t x) {
+        std::vector<uint32_t> result(2);
+        result[0] = static_cast<uint32_t>(x & 0xFFFFFFFF);
+        result[1] = static_cast<uint32_t>(x >> 32);
+        return result;
+    }
+
+    std::vector<std::vector<uint32_t>> uint32(const std::vector<uint64_t>& x) {
+        int N = x.size();
+        std::vector<std::vector<uint32_t>> result(2, std::vector<uint32_t>(N));
+
+        for (int i = 0; i < N; ++i) {
+            result[0][i] = static_cast<uint32_t>(x[i] & 0xFFFFFFFF);
+            result[1][i] = static_cast<uint32_t>(x[i] >> 32);
+        }
+
+        return result;
+    }
+
+    // A single round of the Philox 4x32 random number generator.
+    void philox4_round(std::vector<std::vector<uint32_t>>& counter,
+                       const std::vector<std::vector<uint32_t>>& key) {
+        uint32_t N = counter[0].size();
+        for (uint32_t i = 0; i < N; i++) {
+            std::vector<uint32_t> v1 = uint32(static_cast<uint64_t>(counter[0][i]) * static_cast<uint64_t>(philox_m[0]));
+            std::vector<uint32_t> v2 = uint32(static_cast<uint64_t>(counter[2][i]) * static_cast<uint64_t>(philox_m[1]));
+
+            counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i];
+            counter[1][i] = v2[0];
+            counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i];
+            counter[3][i] = v1[0];
+        }
+    }
+
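+    // Sketch of the layout used by randn() further below: each output element i
+    // gets its own 4-lane counter and a key derived from the seed,
+    //     counter = { offset, 0, i, 0 },   key = uint32(seed) = { seed_lo, seed_hi },
+    // so bumping `offset` once per randn() call yields a fresh, seekable stream.
+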
+    // Generates 32-bit random numbers using the Philox 4x32 random number generator.
+    // Parameters:
+    //     counter : A 4xN array of 32-bit integers representing the counter values (offset into generation).
+    //     key     : A 2xN array of 32-bit integers representing the key values (seed).
+    //     rounds  : The number of rounds to perform.
+    // Returns:
+    //     std::vector<std::vector<uint32_t>>: A 4xN array of 32-bit integers containing the generated random numbers.
+    std::vector<std::vector<uint32_t>> philox4_32(std::vector<std::vector<uint32_t>>& counter,
+                                                  std::vector<std::vector<uint32_t>>& key,
+                                                  int rounds = 10) {
+        uint32_t N = counter[0].size();
+        for (int i = 0; i < rounds - 1; ++i) {
+            philox4_round(counter, key);
+
+            for (uint32_t j = 0; j < N; ++j) {
+                key[0][j] += philox_w[0];
+                key[1][j] += philox_w[1];
+            }
+        }
+
+        philox4_round(counter, key);
+        return counter;
+    }
+
+    float box_muller(float x, float y) {
+        float u = x * two_pow32_inv + two_pow32_inv / 2;
+        float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;
+
+        float s = sqrt(-2.0 * log(u));
+
+        float r1 = s * sin(v);
+        return r1;
+    }
+
+  public:
+    PhiloxRNG(uint64_t seed = 0) {
+        this->seed = seed;
+        this->offset = 0;
+    }
+
+    void manual_seed(uint64_t seed) {
+        this->seed = seed;
+        this->offset = 0;
+    }
+
+    std::vector<float> randn(uint32_t n) {
+        std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
+        for (uint32_t i = 0; i < n; i++) {
+            counter[0][i] = this->offset;
+        }
+
+        for (uint32_t i = 0; i < n; i++) {
+            counter[2][i] = i;
+        }
+        this->offset += 1;
+
+        std::vector<uint64_t> key(n, this->seed);
+        std::vector<std::vector<uint32_t>> key_uint32 = uint32(key);
+
+        std::vector<std::vector<uint32_t>> g = philox4_32(counter, key_uint32);
+
+        std::vector<float> result;
+        for (uint32_t i = 0; i < n; ++i) {
+            result.push_back(box_muller(g[0][i], g[1][i]));
+        }
+        return result;
+    }
+};
+
+#endif  // __RNG_PHILOX_H__
\ No newline at end of file
diff --git a/stable-diffusion.cpp/stable-diffusion.cpp b/stable-diffusion.cpp/stable-diffusion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3da0367dfac9cece52a416f65a9a744b704ae98
--- /dev/null
+++ b/stable-diffusion.cpp/stable-diffusion.cpp
@@ -0,0 +1,4388 @@
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <regex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "ggml/ggml.h"
+#include "rng.h"
+#include "rng_philox.h"
+#include "stable-diffusion.h"
+
+static SDLogLevel log_level = SDLogLevel::INFO;
+
+#define __FILENAME__ "stable-diffusion.cpp"
+#define SD_LOG(level, format, ...)                                                                \
+    do {                                                                                          \
+        if (level < log_level) {                                                                  \
+            break;                                                                                \
+        }                                                                                         \
+        if (level == SDLogLevel::DEBUG) {                                                         \
+            printf("[DEBUG] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__);      \
+            fflush(stdout);                                                                       \
+        } else if (level == SDLogLevel::INFO) {                                                   \
+            printf("[INFO] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__);       \
+            fflush(stdout);                                                                       \
+        } else if (level == SDLogLevel::WARN) {                                                   \
+            fprintf(stderr, "[WARN] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
+            fflush(stdout);                                                                       \
+        } else if (level == SDLogLevel::ERROR) {                                                  \
+            fprintf(stderr, "[ERROR] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
+            fflush(stdout);                                                                       \
+        }                                                                                         \
+    } while (0)
+
+#define LOG_DEBUG(format, ...) SD_LOG(SDLogLevel::DEBUG, format, ##__VA_ARGS__)
+#define LOG_INFO(format, ...) SD_LOG(SDLogLevel::INFO, format, ##__VA_ARGS__)
+#define LOG_WARN(format, ...) SD_LOG(SDLogLevel::WARN, format, ##__VA_ARGS__)
+#define LOG_ERROR(format, ...) SD_LOG(SDLogLevel::ERROR, format, ##__VA_ARGS__)
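+
+// Usage sketch for the macros above (calls are illustrative, not from this file):
+// they take printf-style formats, e.g.
+//     LOG_INFO("loading model from '%s'", file_path.c_str());
+//     LOG_ERROR("unsupported tensor type %d", ttype);
+// WARN/ERROR go to stderr, DEBUG/INFO to stdout, all gated by log_level.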
+
+#define GGML_FILE_MAGIC 0x67676d6c
+
+#define TIMESTEPS 1000
+
+enum ModelType {
+    SD1 = 0,
+    SD2 = 1,
+    MODEL_TYPE_COUNT,
+};
+
+const char* model_type_to_str[] = {
+    "SD1.x",
+    "SD2.x"};
+
+/*================================================== Helper Functions ================================================*/
+
+void set_sd_log_level(SDLogLevel level) {
+    log_level = level;
+}
+
+std::string sd_get_system_info() {
+    std::stringstream ss;
+    ss << "System Info: \n";
+    ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
+    ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
+    ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
+    ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
+    ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
+    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
+    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
+    ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
+    ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
+    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
+    ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
+    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
+    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
+    ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
+    return ss.str();
+}
+
+ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
+    std::ifstream file(file_path, std::ios::binary);
+    if (!file.is_open()) {
+        LOG_ERROR("failed to open '%s'", file_path.c_str());
+        return NULL;
+    }
+    int32_t n_dims;
+    int32_t length;
+    int32_t ttype;
+
+    file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
+    file.read(reinterpret_cast<char*>(&length), sizeof(length));
+    file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
+
+    if (file.eof()) {
+        LOG_ERROR("incomplete file '%s'", file_path.c_str());
+        return NULL;
+    }
+
+    int32_t nelements = 1;
+    int32_t ne[4] = {1, 1, 1, 1};
+    for (int i = 0; i < n_dims; ++i) {
+        file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
+        nelements *= ne[i];
+    }
+    std::string name(length, 0);
+    file.read(&name[0], length);
+    ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
+    const size_t bpe = ggml_type_size(ggml_type(ttype));
+    file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
+    return tensor;
+}
+
+void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
+    uint32_t n = ggml_nelements(tensor);
+    std::vector<float> random_numbers = rng->randn(n);
+    for (uint32_t i = 0; i < n; i++) {
+        ggml_set_f32_1d(tensor, i, random_numbers[i]);
+    }
+}
+
+// set tensor[i, j, k, l]
+// set tensor[l]
+// set tensor[k, l]
+// set tensor[j, k, l]
+void ggml_tensor_set_f32(struct ggml_tensor* tensor, float value, int l, int k = 0, int j = 0, int i = 0) {
+    GGML_ASSERT(tensor->nb[0] == sizeof(float));
+    *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]) = value;
+}
+
+float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
+    GGML_ASSERT(tensor->nb[0] == sizeof(float));
+    return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
+}
+
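+// Sketch: a hypothetical helper (not in the original file) showing the
+// (l, k, j, i) index order of the accessors above, where ne[0] is the
+// fastest-varying axis and nb[] holds the byte stride per dimension.
+void ggml_tensor_fill_f32(struct ggml_tensor* tensor, float value) {
+    for (int i = 0; i < tensor->ne[3]; i++)
+        for (int j = 0; j < tensor->ne[2]; j++)
+            for (int k = 0; k < tensor->ne[1]; k++)
+                for (int l = 0; l < tensor->ne[0]; l++)
+                    ggml_tensor_set_f32(tensor, value, l, k, j, i);
+}
+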
+void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
+    printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    fflush(stdout);
+    if (shape_only) {
+        return;
+    }
+    int range = 3;
+    for (int i = 0; i < tensor->ne[3]; i++) {
+        if (i >= range && i + range < tensor->ne[3]) {
+            continue;
+        }
+        for (int j = 0; j < tensor->ne[2]; j++) {
+            if (j >= range && j + range < tensor->ne[2]) {
+                continue;
+            }
+            for (int k = 0; k < tensor->ne[1]; k++) {
+                if (k >= range && k + range < tensor->ne[1]) {
+                    continue;
+                }
+                for (int l = 0; l < tensor->ne[0]; l++) {
+                    if (l >= range && l + range < tensor->ne[0]) {
+                        continue;
+                    }
+                    printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
+                    fflush(stdout);
+                }
+            }
+        }
+    }
+}
+
+void copy_ggml_tensor(struct ggml_tensor* dst,
+                      const struct ggml_tensor* src) {
+    dst->nb[0] = src->nb[0];
+    dst->nb[1] = src->nb[1];
+    dst->nb[2] = src->nb[2];
+    dst->nb[3] = src->nb[3];
+
+    memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
+}
+
+// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+void set_timestep_embedding(struct ggml_tensor* timesteps, struct ggml_tensor* embedding, int dim, int max_period = 10000) {
+    // timesteps: [N,]
+    // embedding: [(dim + 1)/2, N]
+    int half = dim / 2;
+    std::vector<float> freqs(half);
+    for (int i = 0; i < half; ++i) {
+        freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
+    }
+    for (int i = 0; i < timesteps->ne[0]; ++i) {
+        for (int j = 0; j < half; ++j) {
+            float arg = ggml_get_f32_1d(timesteps, i) * freqs[j];
+            ggml_tensor_set_f32(embedding, std::cos(arg), j, i);
+            ggml_tensor_set_f32(embedding, std::sin(arg), j + half, i);
+        }
+        if (dim % 2 != 0) {
+            *(float*)((char*)embedding->data + i * embedding->nb[1] + dim * embedding->nb[0]) = 0;
+        }
+    }
+}
+
+struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx, struct ggml_tensor* timesteps, int dim, int max_period = 10000) {
+    // timesteps: [N,]
+    // embedding: [(dim + 1)/2, N]
+    int actual_dim = dim;
+    if (dim % 2 != 0) {
+        actual_dim = dim + 1;
+    }
+    struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+    if (!ggml_get_no_alloc(ctx)) {
+        set_timestep_embedding(timesteps, embedding, dim, max_period);
+    }
+    return embedding;
+}
+
+std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
+    int64_t w = t->ne[0];
+    int64_t h = t->ne[1];
+    int64_t c = t->ne[2];
+    std::vector<uint8_t> vec;
+    vec.resize(w * h * c);
+    uint8_t* data = (uint8_t*)vec.data();
+    for (int i = 0; i < h; i++) {
+        for (int j = 0; j < w; j++) {
+            for (int k = 0; k < c; k++) {
+                float value = ggml_tensor_get_f32(t, j, i, k);
+                value = (value + 1.0f) * 0.5f;
+                if (value < 0) {
+                    value = 0;
+                } else if (value > 1) {
+                    value = 1;
+                }
+                value *= 255.f;
+                *(data + i * w * c + j * c + k) = (uint8_t)value;
+            }
+        }
+    }
+    return vec;
+}
+
+void image_vec_to_ggml(const std::vector<uint8_t>& vec,
+                       struct ggml_tensor* t) {
+    int64_t w = t->ne[0];
+    int64_t h = t->ne[1];
+    int64_t c = t->ne[2];
+    const uint8_t* data = vec.data();
+    for (int i = 0; i < h; i++) {
+        for (int j = 0; j < w; j++) {
+            for (int k = 0; k < c; k++) {
+                float value = *(data + i * w * c + j * c + k);
+                value = value / 255.f;
+                value = 2 * value - 1;
+                ggml_tensor_set_f32(t, value, j, i, k);
+            }
+        }
+    }
+}
+
+struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
+                                       struct ggml_tensor* a) {
+    return ggml_group_norm(ctx, a, 32);
+}
+
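+// Note on the converters above: ggml_to_image_vec() / image_vec_to_ggml() map
+// between float tensors in [-1, 1] (layout [w, h, c, 1]) and packed uint8 image
+// bytes in [0, 255]; a round trip loses information only through the clamp and
+// the 8-bit quantization.
+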
+/*================================================== CLIPTokenizer ===================================================*/
+
+const std::string UNK_TOKEN = "<|endoftext|>";
+const std::string BOS_TOKEN = "<|startoftext|>";
+const std::string EOS_TOKEN = "<|endoftext|>";
+const std::string PAD_TOKEN = "<|endoftext|>";
+
+const int UNK_TOKEN_ID = 49407;
+const int BOS_TOKEN_ID = 49406;
+const int EOS_TOKEN_ID = 49407;
+const int PAD_TOKEN_ID = 49407;
+
+// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
+// TODO: implement bpe
+class CLIPTokenizer {
+  private:
+    ModelType model_type = SD1;
+    std::map<std::string, int32_t> encoder;
+    std::regex pat;
+
+    static std::string strip(const std::string& str) {
+        std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
+        std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
+
+        if (start == std::string::npos) {
+            // String contains only whitespace characters
+            return "";
+        }
+
+        return str.substr(start, end - start + 1);
+    }
+
+    static std::string whitespace_clean(std::string text) {
+        text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
+        text = strip(text);
+        return text;
+    }
+
+  public:
+    CLIPTokenizer(ModelType model_type = SD1)
+        : model_type(model_type) {}
+
+    std::string bpe(std::string token) {
+        std::string word = token + "</w>";
+        if (encoder.find(word) != encoder.end()) {
+            return word;
+        } else if (encoder.find(token) != encoder.end()) {
+            return token;
+        }
+        return UNK_TOKEN;
+    }
+
+    void add_token(std::string token, int32_t token_id) {
+        encoder[token] = token_id;
+    }
+
+    std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
+        std::vector<int> tokens = encode(text);
+        tokens.insert(tokens.begin(), BOS_TOKEN_ID);
+        if (max_length > 0) {
+            if (tokens.size() > max_length - 1) {
+                tokens.resize(max_length - 1);
+                tokens.push_back(EOS_TOKEN_ID);
+            } else {
+                tokens.push_back(EOS_TOKEN_ID);
+                if (padding) {
+                    int pad_token_id = PAD_TOKEN_ID;
+                    if (model_type == SD2) {
+                        pad_token_id = 0;
+                    }
+                    tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
+                }
+            }
+        }
+        return tokens;
+    }
+
+    std::vector<int> encode(std::string text) {
+        std::string original_text = text;
+        std::vector<int> bpe_tokens;
+        text = whitespace_clean(text);
+        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
+
+        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+                       std::regex::icase);
+
+        std::smatch matches;
+        std::string str = text;
+        std::vector<std::string> token_strs;
+        while (std::regex_search(str, matches, pat)) {
+            for (auto& token : matches) {
+                std::istringstream iss(bpe(token));
+                std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
+                                                std::istream_iterator<std::string>{}};
+                for (const auto& bpe_token : tokens) {
+                    bpe_tokens.push_back(encoder[bpe_token]);
+                    token_strs.push_back(bpe_token);
+                }
+            }
+            str = matches.suffix();
+        }
+        std::stringstream ss;
+        ss << "[";
+        for (auto token : token_strs) {
+            ss << "\"" << token << "\", ";
+        }
+        ss << "]";
+        LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
+        return bpe_tokens;
+    }
+};
+
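+// Usage sketch: tokenize() wraps encode() with BOS/EOS plus optional padding.
+// Assuming the vocabulary has been loaded via add_token(), a 77-token CLIP
+// context is produced like this (values illustrative, not from this file):
+//     CLIPTokenizer tokenizer(SD1);
+//     std::vector<int> ids = tokenizer.tokenize("a photo of a cat", 77, true);
+//     // ids[0] == BOS_TOKEN_ID, then word ids, EOS_TOKEN_ID, PAD up to 77 entries
+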
+// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
+//
+// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+// Accepted tokens are:
+//   (abc) - increases attention to abc by a multiplier of 1.1
+//   (abc:3.12) - increases attention to abc by a multiplier of 3.12
+//   [abc] - decreases attention to abc by a multiplier of 1.1
+//   \( - literal character '('
+//   \[ - literal character '['
+//   \) - literal character ')'
+//   \] - literal character ']'
+//   \\ - literal character '\'
+//   anything else - just text
+//
+// >>> parse_prompt_attention('normal text')
+// [['normal text', 1.0]]
+// >>> parse_prompt_attention('an (important) word')
+// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+// >>> parse_prompt_attention('(unbalanced')
+// [['unbalanced', 1.1]]
+// >>> parse_prompt_attention('\(literal\]')
+// [['(literal]', 1.0]]
+// >>> parse_prompt_attention('(unnecessary)(parens)')
+// [['unnecessaryparens', 1.1]]
+// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+// [['a ', 1.0],
+//  ['house', 1.5730000000000004],
+//  [' ', 1.1],
+//  ['on', 1.0],
+//  [' a ', 1.1],
+//  ['hill', 0.55],
+//  [', sun, ', 1.1],
+//  ['sky', 1.4641000000000006],
+//  ['.', 1.1]]
+std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
+    std::vector<std::pair<std::string, float>> res;
+    std::vector<int> round_brackets;
+    std::vector<int> square_brackets;
+
+    float round_bracket_multiplier = 1.1f;
+    float square_bracket_multiplier = 1 / 1.1f;
+
+    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
+    std::regex re_break(R"(\s*\bBREAK\b\s*)");
+
+    auto multiply_range = [&](int start_position, float multiplier) {
+        for (int p = start_position; p < res.size(); ++p) {
+            res[p].second *= multiplier;
+        }
+    };
+
+    std::smatch m;
+    std::string remaining_text = text;
+
+    while (std::regex_search(remaining_text, m, re_attention)) {
+        std::string text = m[0];
+        std::string weight = m[1];
+
+        if (text == "(") {
+            round_brackets.push_back(res.size());
+        } else if (text == "[") {
+            square_brackets.push_back(res.size());
+        } else if (!weight.empty()) {
+            if (!round_brackets.empty()) {
+                multiply_range(round_brackets.back(), std::stod(weight));
+                round_brackets.pop_back();
+            }
+        } else if (text == ")" && !round_brackets.empty()) {
+            multiply_range(round_brackets.back(), round_bracket_multiplier);
+            round_brackets.pop_back();
+        } else if (text == "]" && !square_brackets.empty()) {
+            multiply_range(square_brackets.back(), square_bracket_multiplier);
+            square_brackets.pop_back();
+        } else if (text == "\\(") {
+            res.push_back({text.substr(1), 1.0f});
+        } else {
+            res.push_back({text, 1.0f});
+        }
+
+        remaining_text = m.suffix();
+    }
+
+    for (int pos : round_brackets) {
+        multiply_range(pos, round_bracket_multiplier);
+    }
+
+    for (int pos : square_brackets) {
+        multiply_range(pos, square_bracket_multiplier);
+    }
+
+    if (res.empty()) {
+        res.push_back({"", 1.0f});
+    }
+
+    int i = 0;
+    while (i + 1 < res.size()) {
+        if (res[i].second == res[i + 1].second) {
+            res[i].first += res[i + 1].first;
+            res.erase(res.begin() + i + 1);
+        } else {
+            ++i;
+        }
+    }
+
+    return res;
+}
+
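+// Worked example for the parser above: weights compose multiplicatively with
+// nesting, so
+//     parse_prompt_attention("a (red:1.2) ((cat))")
+//     -> {{"a ", 1.0f}, {"red", 1.2f}, {" ", 1.0f}, {"cat", 1.21f}}
+// since each unnumbered '(' level contributes a 1.1x factor (1.1 * 1.1 = 1.21),
+// while an explicit ":1.2" overrides the default multiplier for its group.
+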
+/*================================================ FrozenCLIPEmbedder ================================================*/
+
+struct ResidualAttentionBlock {
+    int32_t n_head;
+    int32_t d_model;
+    int32_t hidden_size;  // n_head * d_model
+    int32_t intermediate_size;
+
+    // attention
+    struct ggml_tensor* q_w;  // [hidden_size, hidden_size]
+    struct ggml_tensor* q_b;  // [hidden_size, ]
+    struct ggml_tensor* k_w;  // [hidden_size, hidden_size]
+    struct ggml_tensor* k_b;  // [hidden_size, ]
+    struct ggml_tensor* v_w;  // [hidden_size, hidden_size]
+    struct ggml_tensor* v_b;  // [hidden_size, ]
+
+    struct ggml_tensor* out_w;  // [hidden_size, hidden_size]
+    struct ggml_tensor* out_b;  // [hidden_size, ]
+
+    // layer norm 1
+    struct ggml_tensor* ln1_w;  // [hidden_size, ]
+    struct ggml_tensor* ln1_b;  // [hidden_size, ]
+
+    // mlp
+    struct ggml_tensor* fc1_w;  // [intermediate_size, hidden_size]
+    struct ggml_tensor* fc1_b;  // [intermediate_size, ]
+
+    struct ggml_tensor* fc2_w;  // [hidden_size, intermediate_size]
+    struct ggml_tensor* fc2_b;  // [hidden_size, ]
+
+    // layer norm 2
+    struct ggml_tensor* ln2_w;  // [hidden_size, ]
+    struct ggml_tensor* ln2_b;  // [hidden_size, ]
+
+    size_t compute_params_mem_size(ggml_type wtype) {
+        double mem_size = 0;
+        mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype);        // q_w/k_w/v_w/out_w
+        mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32);              // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
+        mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype);  // fc1_w/fc2_w
+        mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32);            // fc1_b
+        mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32);                  // fc2_b
+        mem_size += 16 * ggml_tensor_overhead();                                   // tensor overhead
+        return static_cast<size_t>(mem_size);
+    }
+
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        ln1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        ln1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+
+        q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
+        q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
+        k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
+        v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+
+        out_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
+        out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+
+        fc1_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, intermediate_size);
+        fc1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, intermediate_size);
+
+        fc2_w = ggml_new_tensor_2d(ctx, wtype, intermediate_size, hidden_size);
+        fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+
+        ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+    }
+
+    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        tensors[prefix + "self_attn.q_proj.weight"] = q_w;
+        tensors[prefix + "self_attn.q_proj.bias"] = q_b;
+        tensors[prefix + "self_attn.k_proj.weight"] = k_w;
+        tensors[prefix + "self_attn.k_proj.bias"] = k_b;
+        tensors[prefix + "self_attn.v_proj.weight"] = v_w;
+        tensors[prefix + "self_attn.v_proj.bias"] = v_b;
+        tensors[prefix + "self_attn.out_proj.weight"] = out_w;
+        tensors[prefix + "self_attn.out_proj.bias"] = out_b;
+
+        tensors[prefix + "layer_norm1.weight"] = ln1_w;
+        tensors[prefix + "layer_norm1.bias"] = ln1_b;
+
+        tensors[prefix + "layer_norm2.weight"] = ln2_w;
+        tensors[prefix + "layer_norm2.bias"] = ln2_b;
+
+        tensors[prefix + "mlp.fc1.weight"] = fc1_w;
+        tensors[prefix + "mlp.fc1.bias"] = fc1_b;
+
+        tensors[prefix + "mlp.fc2.weight"] = fc2_w;
+        tensors[prefix + "mlp.fc2.bias"] = fc2_b;
+    }
+
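+    // Note: forward() below is standard multi-head self-attention over token
+    // embeddings; with hidden_size = n_head * d_model it computes
+    // softmax(q k^T / sqrt(d_model)) v per head, applying a causal mask
+    // (ggml_diag_mask_inf_inplace) as in the CLIP text encoder.
+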
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, hidden_size]
+        int64_t N = x->ne[2];
+        int64_t n_token = x->ne[1];
+        int64_t hidden_size = n_head * d_model;
+
+        struct ggml_tensor* r = x;
+
+        // layer norm 1
+        {
+            x = ggml_norm(ctx, x, 1e-6f);
+            x = ggml_add(ctx,
+                         ggml_mul(ctx, ggml_repeat(ctx, ln1_w, x), x),
+                         ggml_repeat(ctx, ln1_b, x));
+        }
+        // self-attention
+        {
+            struct ggml_tensor* q = ggml_add(ctx,
+                                             ggml_repeat(ctx, q_b, x),
+                                             ggml_mul_mat(ctx, q_w, x));
+            q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
+            q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
+            q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
+
+            struct ggml_tensor* k = ggml_add(ctx,
+                                             ggml_repeat(ctx, k_b, x),
+                                             ggml_mul_mat(ctx, k_w, x));
+            k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
+            k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
+
+            struct ggml_tensor* v = ggml_add(ctx,
+                                             ggml_repeat(ctx, v_b, x),
+                                             ggml_mul_mat(ctx, v_w, x));
+            v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
+            v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]
+
+            struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_token]
+
+            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
+            kq = ggml_soft_max_inplace(ctx, kq);
+
+            struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_model]
+            kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
+            kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]
+
+            x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N);  // [N * n_token, d_model * n_head]
+        }
+
+        // attention output
+        x = ggml_add(ctx, ggml_repeat(ctx, out_b, x), ggml_mul_mat(ctx, out_w, x));
+
+        // residual
+        x = ggml_add(ctx, x, r);
+        r = x;
+
+        // layer norm 2
+        {
+            x = ggml_norm(ctx, x, 1e-6f);
+
+            x = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ln2_w, x), x),
+                         ggml_repeat(ctx, ln2_b, x));
+        }
+
+        // mlp
+        x = ggml_mul_mat(ctx, fc1_w, x);
+        x = ggml_add(ctx, ggml_repeat(ctx, fc1_b, x), x);
+
+        if (hidden_size == 1024) {  // SD 2.x
+            x = ggml_gelu_inplace(ctx, x);
+        } else {  // SD 1.x
+            x = ggml_gelu_quick_inplace(ctx, x);
+        }
+
+        x = ggml_mul_mat(ctx, fc2_w, x);
+        x = ggml_add(ctx, ggml_repeat(ctx, fc2_b, x), x);
+
+        // residual 2
+        x = ggml_add(ctx, x, r);
+
+        return x;
+    }
+};
+
+// SD1.x: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
+// SD2.x: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
+struct CLIPTextModel {
+    ModelType model_type = SD1;
+    // network hparams
+    int32_t vocab_size = 49408;
+    int32_t max_position_embeddings = 77;
+    int32_t hidden_size = 768;         // 1024 for SD 2.x
+    int32_t intermediate_size = 3072;  // 4096 for SD 2.x
+    int32_t n_head = 12;               // num_attention_heads, 16 for SD 2.x
+    int32_t num_hidden_layers = 12;    // 24 for SD 2.x
+
+    // embeddings
+    struct ggml_tensor* position_ids;
+    struct ggml_tensor* token_embed_weight;
+    struct ggml_tensor* position_embed_weight;
+    // transformer
+    std::vector<ResidualAttentionBlock> resblocks;
+    struct ggml_tensor* final_ln_w;
+    struct ggml_tensor* final_ln_b;
+
+    CLIPTextModel(ModelType model_type = SD1)
+        : model_type(model_type) {
+        if (model_type == SD2) {
+            hidden_size = 1024;
+            intermediate_size = 4096;
+            n_head =
16; + num_hidden_layers = 24; + } + resblocks.resize(num_hidden_layers); + set_resblocks_hp_params(); + } + + void set_resblocks_hp_params() { + int d_model = hidden_size / n_head; // 64 + for (int i = 0; i < num_hidden_layers; i++) { + resblocks[i].d_model = d_model; + resblocks[i].n_head = n_head; + resblocks[i].hidden_size = hidden_size; + resblocks[i].intermediate_size = intermediate_size; + } + } + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(GGML_TYPE_I32); // position_ids + mem_size += hidden_size * vocab_size * ggml_type_sizef(wtype); // token_embed_weight + mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(wtype); // position_embed_weight + for (int i = 0; i < num_hidden_layers; i++) { + mem_size += resblocks[i].compute_params_mem_size(wtype); + } + mem_size += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // final_ln_w/b + mem_size += ggml_tensor_overhead(); // object overhead + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, max_position_embeddings); + for (int i = 0; i < max_position_embeddings; i++) { + ggml_set_i32_1d(position_ids, i, i); + } + token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size); + position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings); + + for (int i = 0; i < num_hidden_layers; i++) { + resblocks[i].init_params(ctx, wtype); + } + + final_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); + final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight; + tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight; + tensors[prefix + "final_layer_norm.weight"] = final_ln_w; + tensors[prefix + "final_layer_norm.bias"] = final_ln_b; + for (int i = 0; i < num_hidden_layers; i++) { + resblocks[i].map_by_name(tensors, prefix + "encoder.layers." 
+ std::to_string(i) + "."); + } + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids) { + // input_ids: [N, n_token] + GGML_ASSERT(input_ids->ne[0] <= position_ids->ne[0]); + + // token_embedding + position_embedding + struct ggml_tensor* x; + x = ggml_add(ctx, + ggml_get_rows(ctx, token_embed_weight, input_ids), + ggml_get_rows(ctx, + position_embed_weight, + ggml_view_1d(ctx, position_ids, input_ids->ne[0], 0))); // [N, n_token, hidden_size] + + // transformer + for (int i = 0; i < num_hidden_layers; i++) { + if (model_type == SD2 && i == num_hidden_layers - 1) { // layer: "penultimate" + break; + } + x = resblocks[i].forward(ctx, x); // [N, n_token, hidden_size] + } + + // final layer norm + { + x = ggml_norm(ctx, x, 1e-6f); + + x = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, final_ln_w, x), x), + ggml_repeat(ctx, final_ln_b, x)); + } + + return x; // [N, n_token, hidden_size] + } +}; + +// ldm.modules.encoders.modules.FrozenCLIPEmbedder +struct FrozenCLIPEmbedder { + CLIPTokenizer tokenizer; + CLIPTextModel text_model; + struct ggml_tensor* forward(struct ggml_context* ctx, const std::string& prompt) { + std::vector tokens = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true); + struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size()); + memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids)); + struct ggml_tensor* hidden_states = text_model.forward(ctx, input_ids); + return hidden_states; + } +}; + +// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283 +struct FrozenCLIPEmbedderWithCustomWords { + ModelType model_type = SD1; + CLIPTokenizer tokenizer; + CLIPTextModel text_model; + + FrozenCLIPEmbedderWithCustomWords(ModelType model_type = SD1) + : model_type(model_type), tokenizer(model_type), text_model(model_type) {} + + std::pair, std::vector> tokenize(std::string text, + size_t max_length = 0, + bool padding = false) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = tokenizer.encode(curr_text); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + tokens.insert(tokens.begin(), BOS_TOKEN_ID); + weights.insert(weights.begin(), 1.0); + + if (max_length > 0) { + if (tokens.size() > max_length - 1) { + tokens.resize(max_length - 1); + weights.resize(max_length - 1); + tokens.push_back(EOS_TOKEN_ID); + weights.push_back(1.0); + } else { + tokens.push_back(EOS_TOKEN_ID); + weights.push_back(1.0); + if (padding) { + int pad_token_id = PAD_TOKEN_ID; + if (model_type == SD2) { + pad_token_id = 0; + } + tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id); + weights.insert(weights.end(), max_length - weights.size(), 1.0); + } + } + } + + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << tokens[i] << ":" << weights[i] << ", "; + // } + // std::cout << std::endl; + + return {tokens, weights}; + } +}; + 
+/*==================================================== UnetModel =====================================================*/ + +struct ResBlock { + // network hparams + int channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4) + int emb_channels; // time_embed_dim + int out_channels; // mult * model_channels + + // network params + // in_layers + struct ggml_tensor* in_layer_0_w; // [channels, ] + struct ggml_tensor* in_layer_0_b; // [channels, ] + // in_layer_1 is nn.SILU() + struct ggml_tensor* in_layer_2_w; // [out_channels, channels, 3, 3] + struct ggml_tensor* in_layer_2_b; // [out_channels, ] + + // emb_layers + // emb_layer_0 is nn.SILU() + struct ggml_tensor* emb_layer_1_w; // [out_channels, emb_channels] + struct ggml_tensor* emb_layer_1_b; // [out_channels, ] + + // out_layers + struct ggml_tensor* out_layer_0_w; // [out_channels, ] + struct ggml_tensor* out_layer_0_b; // [out_channels, ] + // out_layer_1 is nn.SILU() + // out_layer_2 is nn.Dropout(), p = 0 for inference + struct ggml_tensor* out_layer_3_w; // [out_channels, out_channels, 3, 3] + struct ggml_tensor* out_layer_3_b; // [out_channels, ] + + // skip connection, only if out_channels != channels + struct ggml_tensor* skip_w; // [out_channels, channels, 1, 1] + struct ggml_tensor* skip_b; // [out_channels, ] + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += 2 * channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_0_w/b + mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // in_layer_2_w + mem_size += 5 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_2_b/emb_layer_1_b/out_layer_0_w/out_layer_0_b/out_layer_3_b + mem_size += out_channels * emb_channels * ggml_type_sizef(wtype); // emb_layer_1_w + mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_layer_3_w + + mem_size += 10 * ggml_tensor_overhead(); // object overhead + + if (out_channels != channels) { + mem_size += out_channels * channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // skip_w + mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // skip_b + + mem_size += 2 * ggml_tensor_overhead(); // object overhead + } + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + in_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels); + in_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels); + in_layer_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels); + in_layer_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + + emb_layer_1_w = ggml_new_tensor_2d(ctx, wtype, emb_channels, out_channels); + emb_layer_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + + out_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + out_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + out_layer_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels); + out_layer_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + + if (out_channels != channels) { + skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, channels, out_channels); + skip_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + } + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "in_layers.0.weight"] = in_layer_0_w; + tensors[prefix + "in_layers.0.bias"] = in_layer_0_b; + tensors[prefix + "in_layers.2.weight"] = in_layer_2_w; + tensors[prefix + "in_layers.2.bias"] = in_layer_2_b; + + 
tensors[prefix + "emb_layers.1.weight"] = emb_layer_1_w; + tensors[prefix + "emb_layers.1.bias"] = emb_layer_1_b; + + tensors[prefix + "out_layers.0.weight"] = out_layer_0_w; + tensors[prefix + "out_layers.0.bias"] = out_layer_0_b; + tensors[prefix + "out_layers.3.weight"] = out_layer_3_w; + tensors[prefix + "out_layers.3.bias"] = out_layer_3_b; + + if (out_channels != channels) { + tensors[prefix + "skip_connection.weight"] = skip_w; + tensors[prefix + "skip_connection.bias"] = skip_b; + } + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb) { + // x: [N, channels, h, w] + // emb: [N, emb_channels] + + // in_layers + // group norm 32 + auto h = ggml_group_norm_32(ctx, x); + h = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1), + h), + h), + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1), + h)); + // silu + h = ggml_silu_inplace(ctx, h); + // conv2d + h = ggml_conv_2d(ctx, in_layer_2_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_2_b, 1, 1, in_layer_2_b->ne[0], 1), + h)); // [N, out_channels, h, w] + + // emb_layers + auto emb_out = ggml_silu(ctx, emb); + emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out); + emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels] + emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] + emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w] + + // out_layers + h = ggml_add(ctx, h, emb_out); + // group norm 32 + h = ggml_group_norm_inplace(ctx, h, 32); + h = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_w, 1, 1, out_layer_0_w->ne[0], 1), h), h), + ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_b, 1, 1, out_layer_0_b->ne[0], 1), h)); + // silu + h = ggml_silu_inplace(ctx, h); + // dropout, skip for inference + // conv2d + h = ggml_conv_2d(ctx, out_layer_3_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, out_layer_3_b, 1, 1, out_layer_3_b->ne[0], 1), + h)); // [N, out_channels, h, w + + // skip connection + if (out_channels != channels) { + x = ggml_conv_2d(ctx, skip_w, x, 1, 1, 0, 0, 1, 1); + x = ggml_add(ctx, + x, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, skip_b, 1, 1, skip_b->ne[0], 1), + x)); // [N, out_channels, h, w] + } + h = ggml_add(ctx, h, x); + return h; // [N, out_channels, h, w] + } +}; + +struct SpatialTransformer { + int in_channels; // mult * model_channels + int n_head; // num_heads + int d_head; // in_channels // n_heads + int depth = 1; // 1 + int context_dim = 768; // hidden_size, 1024 for SD2.x + + // group norm + struct ggml_tensor* norm_w; // [in_channels,] + struct ggml_tensor* norm_b; // [in_channels,] + + // proj_in + struct ggml_tensor* proj_in_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* proj_in_b; // [in_channels,] + + // transformer + struct + { + // layer norm 1 + struct ggml_tensor* norm1_w; // [in_channels, ] + struct ggml_tensor* norm1_b; // [in_channels, ] + + // attn1 + struct ggml_tensor* attn1_q_w; // [in_channels, in_channels] + struct ggml_tensor* attn1_k_w; // [in_channels, in_channels] + struct ggml_tensor* attn1_v_w; // [in_channels, in_channels] + + struct ggml_tensor* attn1_out_w; // [in_channels, in_channels] + struct ggml_tensor* attn1_out_b; // [in_channels, ] + + // layer norm 2 + struct ggml_tensor* 
norm2_w; // [in_channels, ] + struct ggml_tensor* norm2_b; // [in_channels, ] + + // attn2 + struct ggml_tensor* attn2_q_w; // [in_channels, in_channels] + struct ggml_tensor* attn2_k_w; // [in_channels, context_dim] + struct ggml_tensor* attn2_v_w; // [in_channels, context_dim] + + struct ggml_tensor* attn2_out_w; // [in_channels, in_channels] + struct ggml_tensor* attn2_out_b; // [in_channels, ] + + // layer norm 3 + struct ggml_tensor* norm3_w; // [in_channels, ] + struct ggml_tensor* norm3_b; // [in_channels, ] + + // ff + struct ggml_tensor* ff_0_proj_w; // [in_channels * 4 * 2, in_channels] + struct ggml_tensor* ff_0_proj_b; // [in_channels * 4 * 2] + + struct ggml_tensor* ff_2_w; // [in_channels, in_channels * 4] + struct ggml_tensor* ff_2_b; // [in_channels,] + } transformer; + + // proj_out + struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* proj_out_b; // [in_channels,] + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b + mem_size += 2 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // proj_in_w/proj_out_w + mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // proj_in_b/proj_out_b + + // transformer + { + mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1-3_w/b + mem_size += 6 * in_channels * in_channels * ggml_type_sizef(wtype); // attn1_q/k/v/out_w attn2_q/out_w + mem_size += 2 * in_channels * context_dim * ggml_type_sizef(wtype); // attn2_k/v_w + mem_size += in_channels * 4 * 2 * in_channels * ggml_type_sizef(wtype); // ff_0_proj_w + mem_size += in_channels * 4 * 2 * ggml_type_sizef(GGML_TYPE_F32); // ff_0_proj_b + mem_size += in_channels * 4 * in_channels * ggml_type_sizef(wtype); // ff_2_w + mem_size += in_channels * ggml_type_sizef(GGML_TYPE_F32); // ff_2_b + } + mem_size += 26 * ggml_tensor_overhead(); // object overhead + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + proj_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + proj_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + // transformer + transformer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + transformer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + transformer.attn1_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + transformer.attn1_k_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + transformer.attn1_v_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + + transformer.attn1_out_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + transformer.attn1_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + transformer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + transformer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + transformer.attn2_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + transformer.attn2_k_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels); + transformer.attn2_v_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels); + + transformer.attn2_out_w = 
ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels); + transformer.attn2_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + transformer.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + transformer.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + transformer.ff_0_proj_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels * 4 * 2); + transformer.ff_0_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels * 4 * 2); + + transformer.ff_2_w = ggml_new_tensor_2d(ctx, wtype, in_channels * 4, in_channels); + transformer.ff_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "norm.weight"] = norm_w; + tensors[prefix + "norm.bias"] = norm_b; + tensors[prefix + "proj_in.weight"] = proj_in_w; + tensors[prefix + "proj_in.bias"] = proj_in_b; + + // transformer + { + std::string transformer_prefix = prefix + "transformer_blocks.0."; + tensors[transformer_prefix + "attn1.to_q.weight"] = transformer.attn1_q_w; + tensors[transformer_prefix + "attn1.to_k.weight"] = transformer.attn1_k_w; + tensors[transformer_prefix + "attn1.to_v.weight"] = transformer.attn1_v_w; + + tensors[transformer_prefix + "attn1.to_out.0.weight"] = transformer.attn1_out_w; + tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b; + + tensors[transformer_prefix + "ff.net.0.proj.weight"] = transformer.ff_0_proj_w; + tensors[transformer_prefix + "ff.net.0.proj.bias"] = transformer.ff_0_proj_b; + tensors[transformer_prefix + "ff.net.2.weight"] = transformer.ff_2_w; + tensors[transformer_prefix + "ff.net.2.bias"] = transformer.ff_2_b; + + tensors[transformer_prefix + "attn2.to_q.weight"] = transformer.attn2_q_w; + tensors[transformer_prefix + "attn2.to_k.weight"] = transformer.attn2_k_w; + tensors[transformer_prefix + "attn2.to_v.weight"] = transformer.attn2_v_w; + + tensors[transformer_prefix + "attn2.to_out.0.weight"] = transformer.attn2_out_w; + tensors[transformer_prefix + "attn2.to_out.0.bias"] = transformer.attn2_out_b; + + tensors[transformer_prefix + "norm1.weight"] = transformer.norm1_w; + tensors[transformer_prefix + "norm1.bias"] = transformer.norm1_b; + tensors[transformer_prefix + "norm2.weight"] = transformer.norm2_w; + tensors[transformer_prefix + "norm2.bias"] = transformer.norm2_b; + tensors[transformer_prefix + "norm3.weight"] = transformer.norm3_w; + tensors[transformer_prefix + "norm3.bias"] = transformer.norm3_b; + } + + tensors[prefix + "proj_out.weight"] = proj_out_w; + tensors[prefix + "proj_out.bias"] = proj_out_b; + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) { + // x: [N, in_channels, h, w] + // context: [N, max_position, hidden_size(aka context_dim)] + + auto x_in = x; + // group norm 32 + x = ggml_group_norm_32(ctx, x); + x = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), x), x), + ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), x)); + // proj_in + x = ggml_conv_2d(ctx, proj_in_w, x, 1, 1, 0, 0, 1, 1); + x = ggml_add(ctx, + x, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, proj_in_b, 1, 1, proj_in_b->ne[0], 1), + x)); // [N, in_channels, h, w] + + // transformer + const int64_t n = x->ne[3]; + const int64_t c = x->ne[2]; + const int64_t h = x->ne[1]; + const int64_t w = x->ne[0]; + const int64_t max_position = context->ne[1]; + x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, 
in_channels] + + { + auto r = x; + // layer norm 1 + { + x = ggml_reshape_2d(ctx, x, c, w * h * n); + x = ggml_norm(ctx, x, 1e-6f); + x = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, transformer.norm1_w, x), + x), + ggml_repeat(ctx, transformer.norm1_b, x)); + } + + // self-attention + { + x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels] + struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x); // [N * h * w, in_channels] + q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head))); + q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head] + q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head] + q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head] + + struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x); // [N * h * w, in_channels] + k = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head] + k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, h * w, d_head] + k = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head] + + struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x); // [N * h * w, in_channels] + v = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head] + v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, h * w] + v = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n); // [N * n_head, d_head, h * w] + + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, h * w] + // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); + kq = ggml_soft_max_inplace(ctx, kq); + + struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head] + kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n); + kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, h * w, n_head, d_head] + + // x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n)); + x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n); + + x = ggml_add(ctx, ggml_repeat(ctx, transformer.attn1_out_b, x), ggml_mul_mat(ctx, transformer.attn1_out_w, x)); + + x = ggml_reshape_4d(ctx, x, c, w, h, n); + } + + x = ggml_add(ctx, x, r); + r = x; + + // layer norm 2 + { + x = ggml_norm(ctx, x, 1e-6f); + x = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, transformer.norm2_w, x), x), + ggml_repeat(ctx, transformer.norm2_b, x)); + } + + // cross-attention + { + x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels] + context = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]); // [N * max_position, hidden_size] + struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x); // [N * h * w, in_channels] + + q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head))); + q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head] + q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head] + q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head] + + struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context); // [N * max_position, in_channels] + k = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head] + k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, max_position, d_head] + k = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n); // [N * 
n_head, max_position, d_head] + + struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context); // [N * max_position, in_channels] + v = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head] + v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, max_position] + v = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n); // [N * n_head, d_head, max_position] + + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] + // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); + kq = ggml_soft_max_inplace(ctx, kq); + + struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head] + + kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n); + kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); + + // x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n)); // [N * h * w, in_channels] + x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n); // [N * h * w, in_channels] + + x = ggml_add(ctx, ggml_repeat(ctx, transformer.attn2_out_b, x), ggml_mul_mat(ctx, transformer.attn2_out_w, x)); + + x = ggml_reshape_4d(ctx, x, c, w, h, n); + } + + x = ggml_add(ctx, x, r); + r = x; + + // layer norm 3 + { + x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels] + x = ggml_norm(ctx, x, 1e-6f); + x = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, transformer.norm3_w, x), x), + ggml_repeat(ctx, transformer.norm3_b, x)); + } + + // ff + { + // GEGLU + auto x_w = ggml_view_2d(ctx, + transformer.ff_0_proj_w, + transformer.ff_0_proj_w->ne[0], + transformer.ff_0_proj_w->ne[1] / 2, + transformer.ff_0_proj_w->nb[1], + 0); // [in_channels * 4, in_channels] + auto x_b = ggml_view_1d(ctx, + transformer.ff_0_proj_b, + transformer.ff_0_proj_b->ne[0] / 2, + 0); // [in_channels * 4, in_channels] + auto gate_w = ggml_view_2d(ctx, + transformer.ff_0_proj_w, + transformer.ff_0_proj_w->ne[0], + transformer.ff_0_proj_w->ne[1] / 2, + transformer.ff_0_proj_w->nb[1], + transformer.ff_0_proj_w->nb[1] * transformer.ff_0_proj_w->ne[1] / 2); // [in_channels * 4, ] + auto gate_b = ggml_view_1d(ctx, + transformer.ff_0_proj_b, + transformer.ff_0_proj_b->ne[0] / 2, + transformer.ff_0_proj_b->nb[0] * transformer.ff_0_proj_b->ne[0] / 2); // [in_channels * 4, ] + x = ggml_reshape_2d(ctx, x, c, w * h * n); + auto x_in = x; + x = ggml_mul_mat(ctx, x_w, x_in); // [N * h * w, in_channels * 4] + x = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x); + auto gate = ggml_mul_mat(ctx, gate_w, x_in); // [N * h * w, in_channels * 4] + gate = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate); + + gate = ggml_gelu_inplace(ctx, gate); + + x = ggml_mul(ctx, x, gate); // [N * h * w, in_channels * 4] + // fc + x = ggml_mul_mat(ctx, transformer.ff_2_w, x); // [N * h * w, in_channels] + x = ggml_add(ctx, ggml_repeat(ctx, transformer.ff_2_b, x), x); + } + + x = ggml_reshape_4d(ctx, x, c, w, h, n); // [N, h, w, in_channels] + + // residual + x = ggml_add(ctx, x, r); + } + x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // // [N, in_channels, h, w] + + // proj_out + x = ggml_conv_2d(ctx, proj_out_w, x, 1, 1, 0, 0, 1, 1); + x = ggml_add(ctx, + x, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, proj_out_b, 1, 1, proj_out_b->ne[0], 1), + x)); // [N, in_channels, h, w] + x = ggml_add(ctx, x, x_in); + return x; + } +}; + +struct DownSample { + // hparams + int channels; + int out_channels; + + // conv2d params + struct ggml_tensor* op_w; // [out_channels, channels, 3, 3] + struct 
ggml_tensor* op_b; // [out_channels,] + + bool vae_downsample = false; + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w + mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b + mem_size += 2 * ggml_tensor_overhead(); // object overhead + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels); + op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + if (vae_downsample) { + tensors[prefix + "conv.weight"] = op_w; + tensors[prefix + "conv.bias"] = op_b; + } else { + tensors[prefix + "op.weight"] = op_w; + tensors[prefix + "op.bias"] = op_b; + } + } + + // TODO: making it parallel + static void asymmetric_pad(struct ggml_tensor* dst, + const struct ggml_tensor* a, + const struct ggml_tensor* b, + int ith, + int nth, + void* userdata) { + assert(sizeof(dst->nb[0]) == sizeof(float)); + assert(sizeof(a->nb[0]) == sizeof(float)); + assert(sizeof(b->nb[0]) == sizeof(float)); + float value = 0; + + for (int i = 0; i < dst->ne[3]; i++) { + for (int j = 0; j < dst->ne[2]; j++) { + for (int k = 0; k < dst->ne[1]; k++) { + for (int l = 0; l < dst->ne[0]; l++) { + if (k == dst->ne[1] - 1 || l == dst->ne[0] - 1) { + value = 0; + } else { + value = ggml_tensor_get_f32(b, l, k, j, i); + } + // printf("%d %d %d %d -> %f\n", i, j, k, l, value); + ggml_tensor_set_f32(dst, value, l, k, j, i); + } + } + } + } + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + // x: [N, channels, h, w] + if (vae_downsample) { + bool dynamic = ggml_get_dynamic(ctx); + ggml_set_dynamic(ctx, false); + auto pad_x = ggml_new_tensor_4d(ctx, x->type, x->ne[0] + 1, x->ne[1] + 1, x->ne[2], x->ne[3]); + ggml_set_dynamic(ctx, dynamic); + + x = ggml_map_custom2_inplace(ctx, pad_x, x, asymmetric_pad, 1, NULL); + x = ggml_conv_2d(ctx, op_w, x, 2, 2, 0, 0, 1, 1); + } else { + x = ggml_conv_2d(ctx, op_w, x, 2, 2, 1, 1, 1, 1); + } + x = ggml_add(ctx, + x, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, op_b, 1, 1, op_b->ne[0], 1), + x)); // [N, out_channels, h/2, w/2] + return x; + } +}; + +struct UpSample { + // hparams + int channels; + int out_channels; + + // conv2d params + struct ggml_tensor* conv_w; // [out_channels, channels, 3, 3] + struct ggml_tensor* conv_b; // [out_channels,] + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w + mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b + mem_size += 2 * ggml_tensor_overhead(); // object overhead + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels); + conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "conv.weight"] = conv_w; + tensors[prefix + "conv.bias"] = conv_b; + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + // x: [N, channels, h, w] + x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2] + x = ggml_conv_2d(ctx, conv_w, x, 1, 1, 1, 1, 1, 1); + + x = ggml_add(ctx, + x, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv_b, 1, 1, 
conv_b->ne[0], 1), + x)); // [N, out_channels, h*2, w*2] + return x; + } +}; + +// ldm.modules.diffusionmodules.openaimodel.UNetModel +struct UNetModel { + // network hparams + int in_channels = 4; + int model_channels = 320; + int out_channels = 4; + int num_res_blocks = 2; + int attention_resolutions[3] = {4, 2, 1}; + int channel_mult[4] = {1, 2, 4, 4}; + int time_embed_dim = 1280; // model_channels*4 + int num_heads = 8; + int num_head_channels = -1; // channels // num_heads + int context_dim = 768; // 1024 for SD2.x + + // network params + struct ggml_tensor* time_embed_0_w; // [time_embed_dim, model_channels] + struct ggml_tensor* time_embed_0_b; // [time_embed_dim, ] + // time_embed_1 is nn.SILU() + struct ggml_tensor* time_embed_2_w; // [time_embed_dim, time_embed_dim] + struct ggml_tensor* time_embed_2_b; // [time_embed_dim, ] + + struct ggml_tensor* input_block_0_w; // [model_channels, in_channels, 3, 3] + struct ggml_tensor* input_block_0_b; // [model_channels, ] + + // input_blocks + ResBlock input_res_blocks[4][2]; + SpatialTransformer input_transformers[3][2]; + DownSample input_down_samples[3]; + + // middle_block + ResBlock middle_block_0; + SpatialTransformer middle_block_1; + ResBlock middle_block_2; + + // output_blocks + ResBlock output_res_blocks[4][3]; + SpatialTransformer output_transformers[3][3]; + UpSample output_up_samples[3]; + + // out + // group norm 32 + struct ggml_tensor* out_0_w; // [model_channels, ] + struct ggml_tensor* out_0_b; // [model_channels, ] + // out 1 is nn.SILU() + struct ggml_tensor* out_2_w; // [out_channels, model_channels, 3, 3] + struct ggml_tensor* out_2_b; // [out_channels, ] + + UNetModel(ModelType model_type = SD1) { + if (model_type == SD2) { + context_dim = 1024; + num_head_channels = 64; + num_heads = -1; + } + // set up hparams of blocks + + // input_blocks + std::vector input_block_chans; + input_block_chans.push_back(model_channels); + int ch = model_channels; + int ds = 1; + + int len_mults = sizeof(channel_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + int mult = channel_mult[i]; + for (int j = 0; j < num_res_blocks; j++) { + input_res_blocks[i][j].channels = ch; + input_res_blocks[i][j].emb_channels = time_embed_dim; + input_res_blocks[i][j].out_channels = mult * model_channels; + + ch = mult * model_channels; + + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + int n_head = num_heads; + int d_head = ch / num_heads; + if (num_head_channels != -1) { + d_head = num_head_channels; + n_head = ch / d_head; + } + input_transformers[i][j].in_channels = ch; + input_transformers[i][j].n_head = n_head; + input_transformers[i][j].d_head = d_head; + input_transformers[i][j].context_dim = context_dim; + } + input_block_chans.push_back(ch); + } + if (i != len_mults - 1) { + input_down_samples[i].channels = ch; + input_down_samples[i].out_channels = ch; + input_block_chans.push_back(ch); + + ds *= 2; + } + } + + // middle blocks + middle_block_0.channels = ch; + middle_block_0.emb_channels = time_embed_dim; + middle_block_0.out_channels = ch; + + int n_head = num_heads; + int d_head = ch / num_heads; + if (num_head_channels != -1) { + d_head = num_head_channels; + n_head = ch / d_head; + } + middle_block_1.in_channels = ch; + middle_block_1.n_head = n_head; + middle_block_1.d_head = d_head; + middle_block_1.context_dim = context_dim; + + middle_block_2.channels = ch; + middle_block_2.emb_channels = time_embed_dim; + middle_block_2.out_channels = ch; + + // output 
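+
+// [editor's note] ggml_upscale in UpSample::forward above is a nearest-neighbour
+// 2x resize: every source pixel is replicated into a 2x2 block before the 3x3
+// conv. Reference for one [h, w] channel (plain C++, hypothetical helper):
+static void upscale2x_ref(const float* src, float* dst, int h, int w) {
+    for (int y = 0; y < 2 * h; y++)
+        for (int x = 0; x < 2 * w; x++)
+            dst[y * (2 * w) + x] = src[(y / 2) * w + (x / 2)];
+}
+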
blocks + for (int i = len_mults - 1; i >= 0; i--) { + int mult = channel_mult[i]; + for (int j = 0; j < num_res_blocks + 1; j++) { + int ich = input_block_chans.back(); + input_block_chans.pop_back(); + + output_res_blocks[i][j].channels = ch + ich; + output_res_blocks[i][j].emb_channels = time_embed_dim; + output_res_blocks[i][j].out_channels = mult * model_channels; + + ch = mult * model_channels; + + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + int n_head = num_heads; + int d_head = ch / num_heads; + if (num_head_channels != -1) { + d_head = num_head_channels; + n_head = ch / d_head; + } + output_transformers[i][j].in_channels = ch; + output_transformers[i][j].n_head = n_head; + output_transformers[i][j].d_head = d_head; + output_transformers[i][j].context_dim = context_dim; + } + + if (i > 0 && j == num_res_blocks) { + output_up_samples[i - 1].channels = ch; + output_up_samples[i - 1].out_channels = ch; + + ds /= 2; + } + } + } + } + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += time_embed_dim * model_channels * ggml_type_sizef(wtype); // time_embed_0_w + mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_0_b + mem_size += time_embed_dim * time_embed_dim * ggml_type_sizef(wtype); // time_embed_2_w + mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_2_b + + mem_size += model_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // input_block_0_w + mem_size += model_channels * ggml_type_sizef(GGML_TYPE_F32); // input_block_0_b + + mem_size += 6 * ggml_tensor_overhead(); // object overhead + + // input_blocks + int ds = 1; + int len_mults = sizeof(channel_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + mem_size += input_res_blocks[i][j].compute_params_mem_size(wtype); + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + mem_size += input_transformers[i][j].compute_params_mem_size(wtype); + } + } + if (i != len_mults - 1) { + ds *= 2; + mem_size += input_down_samples[i].compute_params_mem_size(wtype); + } + } + + // middle_block + mem_size += middle_block_0.compute_params_mem_size(wtype); + mem_size += middle_block_1.compute_params_mem_size(wtype); + mem_size += middle_block_2.compute_params_mem_size(wtype); + + // output_blocks + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + mem_size += output_res_blocks[i][j].compute_params_mem_size(wtype); + + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + mem_size += output_transformers[i][j].compute_params_mem_size(wtype); + } + + if (i > 0 && j == num_res_blocks) { + mem_size += output_up_samples[i - 1].compute_params_mem_size(wtype); + + ds /= 2; + } + } + } + + // out + mem_size += 2 * model_channels * ggml_type_sizef(GGML_TYPE_F32); // out_0_w/b + mem_size += out_channels * model_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_2_w + mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // out_2_b + + mem_size += 4 * ggml_tensor_overhead(); + + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + time_embed_0_w = ggml_new_tensor_2d(ctx, wtype, model_channels, time_embed_dim); + time_embed_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim); + + time_embed_2_w = ggml_new_tensor_2d(ctx, wtype, 
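+
+// [editor's note] Bookkeeping sketch for the input_block_chans stack in the
+// constructor above: the input side pushes its running width after every block,
+// and each output block pops one entry, so its ResBlock sees ch + ich input
+// channels (the concat of the upsampled features with the matching skip).
+// Hypothetical stand-alone driver, assumes <vector>/<cstdio>:
+static void skip_chans_sketch() {
+    const int mult[4] = {1, 2, 4, 4};
+    const int base = 320, nres = 2;                // model_channels, num_res_blocks
+    std::vector<int> chans = {base};
+    int ch = base;
+    for (int i = 0; i < 4; i++) {                  // input side: push after every block
+        for (int j = 0; j < nres; j++) { ch = mult[i] * base; chans.push_back(ch); }
+        if (i != 3) chans.push_back(ch);           // downsample keeps the width
+    }
+    for (int i = 3; i >= 0; i--)                   // output side: pop one skip per block
+        for (int j = 0; j < nres + 1; j++) {
+            int ich = chans.back(); chans.pop_back();
+            printf("output_res_blocks[%d][%d] input channels: %d\n", i, j, ch + ich);
+            ch = mult[i] * base;
+        }
+}
+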
time_embed_dim, time_embed_dim); + time_embed_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim); + + // input_blocks + input_block_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, model_channels); + input_block_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels); + int ds = 1; + int len_mults = sizeof(channel_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + input_res_blocks[i][j].init_params(ctx, wtype); + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + input_transformers[i][j].init_params(ctx, wtype); + } + } + if (i != len_mults - 1) { + input_down_samples[i].init_params(ctx, wtype); + ds *= 2; + } + } + + // middle_blocks + middle_block_0.init_params(ctx, wtype); + middle_block_1.init_params(ctx, wtype); + middle_block_2.init_params(ctx, wtype); + + // output_blocks + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + output_res_blocks[i][j].init_params(ctx, wtype); + + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + output_transformers[i][j].init_params(ctx, wtype); + } + + if (i > 0 && j == num_res_blocks) { + output_up_samples[i - 1].init_params(ctx, wtype); + + ds /= 2; + } + } + } + + // out + out_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels); + out_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels); + + out_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, model_channels, out_channels); + out_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "time_embed.0.weight"] = time_embed_0_w; + tensors[prefix + "time_embed.0.bias"] = time_embed_0_b; + + tensors[prefix + "time_embed.2.weight"] = time_embed_2_w; + tensors[prefix + "time_embed.2.bias"] = time_embed_2_b; + + // input_blocks + tensors[prefix + "input_blocks.0.0.weight"] = input_block_0_w; + tensors[prefix + "input_blocks.0.0.bias"] = input_block_0_b; + + int len_mults = sizeof(channel_mult) / sizeof(int); + int input_block_idx = 0; + int ds = 1; + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + input_block_idx += 1; + + input_res_blocks[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0."); + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + input_transformers[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".1."); + } + } + if (i != len_mults - 1) { + input_block_idx += 1; + input_down_samples[i].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0."); + ds *= 2; + } + } + + // middle_blocks + middle_block_0.map_by_name(tensors, prefix + "middle_block.0."); + middle_block_1.map_by_name(tensors, prefix + "middle_block.1."); + middle_block_2.map_by_name(tensors, prefix + "middle_block.2."); + + // output_blocks + int output_block_idx = 0; + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + output_res_blocks[i][j].map_by_name(tensors, prefix + "output_blocks." 
+ std::to_string(output_block_idx) + ".0."); + + int up_sample_idx = 1; + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + output_transformers[i][j].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + ".1."); + up_sample_idx++; + } + + if (i > 0 && j == num_res_blocks) { + output_up_samples[i - 1].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx) + "."); + + ds /= 2; + } + output_block_idx += 1; + } + } + + // out + tensors[prefix + "out.0.weight"] = out_0_w; + tensors[prefix + "out.0.bias"] = out_0_b; + tensors[prefix + "out.2.weight"] = out_2_w; + tensors[prefix + "out.2.bias"] = out_2_b; + } + + struct ggml_tensor* forward(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* t_emb = NULL) { + // x: [N, in_channels, h, w] + // timesteps: [N, ] + // t_emb: [N, model_channels] + // context: [N, max_position, hidden_size]([N, 77, 768]) + if (t_emb == NULL && timesteps != NULL) { + t_emb = new_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels] + } + + // time_embed + auto emb = ggml_mul_mat(ctx, time_embed_0_w, t_emb); + emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb); + emb = ggml_silu_inplace(ctx, emb); + emb = ggml_mul_mat(ctx, time_embed_2_w, emb); + emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb); // [N, time_embed_dim] + + // input_blocks + std::vector hs; + // input block 0 + auto h = ggml_conv_2d(ctx, input_block_0_w, x, 1, 1, 1, 1, 1, 1); // [N, model_channels, h, w] + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1), + h)); // [N, model_channels, h, w] + hs.push_back(h); + // input block 1-11 + int len_mults = sizeof(channel_mult) / sizeof(int); + int ds = 1; + for (int i = 0; i < len_mults; i++) { + int mult = channel_mult[i]; + for (int j = 0; j < num_res_blocks; j++) { + h = input_res_blocks[i][j].forward(ctx, h, emb); // [N, mult*model_channels, h, w] + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + h = input_transformers[i][j].forward(ctx, h, context); // [N, mult*model_channels, h, w] + } + hs.push_back(h); + } + if (i != len_mults - 1) { + ds *= 2; + h = input_down_samples[i].forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))] + hs.push_back(h); + } + } + // [N, 4*model_channels, h/8, w/8] + + // middle_block + h = middle_block_0.forward(ctx, h, emb); // [N, 4*model_channels, h/8, w/8] + h = middle_block_1.forward(ctx, h, context); // [N, 4*model_channels, h/8, w/8] + h = middle_block_2.forward(ctx, h, emb); // [N, 4*model_channels, h/8, w/8] + + // output_blocks + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + auto h_skip = hs.back(); + hs.pop_back(); + + h = ggml_concat(ctx, h, h_skip); + h = output_res_blocks[i][j].forward(ctx, h, emb); + + if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) { + h = output_transformers[i][j].forward(ctx, h, context); + } + + if (i > 0 && j == num_res_blocks) { + h = output_up_samples[i - 1].forward(ctx, h); + + ds /= 2; + } + } + } + + // out + // group norm 32 + h = ggml_group_norm_32(ctx, h); + h = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, out_0_w, 1, 1, out_0_w->ne[0], 
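+
+// [editor's note] new_timestep_embedding / set_timestep_embedding (defined
+// earlier in this file) fill the standard sinusoidal embedding used for t_emb.
+// Reference sketch, assuming the common half-cos / half-sin layout with base
+// 10000 (the exact ordering lives in those helpers):
+static void timestep_embedding_ref(float t, float* emb, int dim) {
+    int half = dim / 2;
+    for (int i = 0; i < half; i++) {
+        float freq = expf(-logf(10000.0f) * i / half);
+        emb[i]        = cosf(t * freq);
+        emb[half + i] = sinf(t * freq);
+    }
+}
+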
1), + h), + h), + ggml_repeat(ctx, + ggml_reshape_4d(ctx, out_0_b, 1, 1, out_0_b->ne[0], 1), + h)); + // silu + h = ggml_silu_inplace(ctx, h); + // conv2d + h = ggml_conv_2d(ctx, out_2_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, out_2_b, 1, 1, out_2_b->ne[0], 1), + h)); // [N, out_channels, h, w] + + return h; + } +}; + +/*================================================== AutoEncoderKL ===================================================*/ + +struct ResnetBlock { + // network hparams + int in_channels; + int out_channels; + + // network params + struct ggml_tensor* norm1_w; // [in_channels, ] + struct ggml_tensor* norm1_b; // [in_channels, ] + + struct ggml_tensor* conv1_w; // [out_channels, in_channels, 3, 3] + struct ggml_tensor* conv1_b; // [out_channels, ] + + struct ggml_tensor* norm2_w; // [out_channels, ] + struct ggml_tensor* norm2_b; // [out_channels, ] + + struct ggml_tensor* conv2_w; // [out_channels, out_channels, 3, 3] + struct ggml_tensor* conv2_b; // [out_channels, ] + + // nin_shortcut, only if out_channels != in_channels + struct ggml_tensor* nin_shortcut_w; // [out_channels, in_channels, 1, 1] + struct ggml_tensor* nin_shortcut_b; // [out_channels, ] + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1_w/b + mem_size += out_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv1_w + mem_size += 4 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // conv1_b/norm2_w/norm2_b/conv2_b + mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv2_w + + mem_size += 8 * ggml_tensor_overhead(); // object overhead + + if (out_channels != in_channels) { + mem_size += out_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // nin_shortcut_w + mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // nin_shortcut_b + + mem_size += 2 * ggml_tensor_overhead(); // object overhead + } + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels); + conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + + norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels); + conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + + if (out_channels != in_channels) { + nin_shortcut_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, out_channels); + nin_shortcut_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels); + } + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "norm1.weight"] = norm1_w; + tensors[prefix + "norm1.bias"] = norm1_b; + tensors[prefix + "conv1.weight"] = conv1_w; + tensors[prefix + "conv1.bias"] = conv1_b; + + tensors[prefix + "norm2.weight"] = norm2_w; + tensors[prefix + "norm2.bias"] = norm2_b; + tensors[prefix + "conv2.weight"] = conv2_w; + tensors[prefix + "conv2.bias"] = conv2_b; + + if (out_channels != in_channels) { + tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w; + tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b; + } + } + + struct ggml_tensor* forward(struct 
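+
+// [editor's note] Two recurring idioms above and below: ggml_silu_inplace
+// computes SiLU (a.k.a. swish), x * sigmoid(x), and ggml_group_norm_32 is
+// GroupNorm with 32 groups and no affine term, so the per-channel weight and
+// bias are applied manually right after via ggml_mul/ggml_add. Scalar SiLU:
+static float silu_ref(float x) { return x / (1.0f + expf(-x)); }
+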
ggml_context* ctx, struct ggml_tensor* z) { + // z: [N, in_channels, h, w] + + // group norm 32 + auto h = ggml_group_norm_32(ctx, z); + h = ggml_mul(ctx, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1), + h), + h); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1), + h)); + // silu + h = ggml_silu_inplace(ctx, h); + // conv2d + h = ggml_conv_2d(ctx, conv1_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv1_b, 1, 1, conv1_b->ne[0], 1), + h)); // [N, out_channels, h, w] + + // group norm 32 + h = ggml_group_norm_32(ctx, h); + h = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_w, 1, 1, norm2_w->ne[0], 1), h), h), + ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_b, 1, 1, norm2_b->ne[0], 1), h)); + // silu + h = ggml_silu_inplace(ctx, h); + // dropout, skip for inference + // conv2d + h = ggml_conv_2d(ctx, conv2_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv2_b, 1, 1, conv2_b->ne[0], 1), + h)); // [N, out_channels, h, w + + // skip connection + if (out_channels != in_channels) { + z = ggml_conv_2d(ctx, nin_shortcut_w, z, 1, 1, 0, 0, 1, 1); + z = ggml_add(ctx, + z, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, nin_shortcut_b, 1, 1, nin_shortcut_b->ne[0], 1), + z)); // [N, out_channels, h, w] + } + h = ggml_add(ctx, h, z); + return h; // [N, out_channels, h, w] + } +}; + +struct AttnBlock { + int in_channels; // mult * model_channels + + // group norm + struct ggml_tensor* norm_w; // [in_channels,] + struct ggml_tensor* norm_b; // [in_channels,] + + // q/k/v + struct ggml_tensor* q_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* q_b; // [in_channels,] + struct ggml_tensor* k_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* k_b; // [in_channels,] + struct ggml_tensor* v_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* v_b; // [in_channels,] + + // proj_out + struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1] + struct ggml_tensor* proj_out_b; // [in_channels,] + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b/q_b/k_v/v_b/proj_out_b + mem_size += 4 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // q_w/k_w/v_w/proj_out_w + mem_size += 10 * ggml_tensor_overhead(); // object overhead + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + + proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); + proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "norm.weight"] = norm_w; + tensors[prefix + "norm.bias"] = norm_b; + tensors[prefix + "q.weight"] = q_w; + tensors[prefix + "q.bias"] = q_b; + tensors[prefix + 
"k.weight"] = k_w; + tensors[prefix + "k.bias"] = k_b; + tensors[prefix + "v.weight"] = v_w; + tensors[prefix + "v.bias"] = v_b; + tensors[prefix + "proj_out.weight"] = proj_out_w; + tensors[prefix + "proj_out.bias"] = proj_out_b; + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + // x: [N, in_channels, h, w] + + // group norm 32 + auto h_ = ggml_group_norm_32(ctx, x); + h_ = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_), + ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_)); + + const int64_t n = h_->ne[3]; + const int64_t c = h_->ne[2]; + const int64_t h = h_->ne[1]; + const int64_t w = h_->ne[0]; + // q + auto q = ggml_conv_2d(ctx, q_w, h_, 1, 1, 0, 0, 1, 1); + q = ggml_add(ctx, + q, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1), + q)); // [N, in_channels, h, w] + + // k + auto k = ggml_conv_2d(ctx, k_w, h_, 1, 1, 0, 0, 1, 1); + k = ggml_add(ctx, + k, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1), + k)); // [N, in_channels, h, w] + + // v + auto v = ggml_conv_2d(ctx, v_w, h_, 1, 1, 0, 0, 1, 1); + v = ggml_add(ctx, + v, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1), + v)); // [N, in_channels, h, w] + + q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels] + q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels] + + k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels] + k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels] + + auto w_ = ggml_mul_mat(ctx, k, q); // [N, h * w, h * w] + w_ = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c))); + w_ = ggml_soft_max_inplace(ctx, w_); + + v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w] + h_ = ggml_mul_mat(ctx, v, w_); // [N, h * w, in_channels] + h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w] + h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w] + + // proj_out + h_ = ggml_conv_2d(ctx, proj_out_w, h_, 1, 1, 0, 0, 1, 1); + h_ = ggml_add(ctx, + h_, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, proj_out_b, 1, 1, proj_out_b->ne[0], 1), + h_)); // [N, in_channels, h, w] + h_ = ggml_add(ctx, h_, x); + return h_; + } +}; + +// ldm.modules.diffusionmodules.model.Encoder +struct Encoder { + int embed_dim = 4; + int ch = 128; + int z_channels = 4; + int in_channels = 3; + int num_res_blocks = 2; + int ch_mult[4] = {1, 2, 4, 4}; + + struct ggml_tensor* conv_in_w; // [ch, in_channels, 3, 3] + struct ggml_tensor* conv_in_b; // [ch, ] + + ResnetBlock down_blocks[4][2]; + DownSample down_samples[3]; + + struct + { + ResnetBlock block_1; + AttnBlock attn_1; + ResnetBlock block_2; + } mid; + + // block_in = ch * ch_mult[len_mults - 1] + struct ggml_tensor* norm_out_w; // [block_in, ] + struct ggml_tensor* norm_out_b; // [block_in, ] + + struct ggml_tensor* conv_out_w; // [embed_dim*2, block_in, 3, 3] + struct ggml_tensor* conv_out_b; // [embed_dim*2, ] + + Encoder() { + int len_mults = sizeof(ch_mult) / sizeof(int); + + int block_in = 1; + for (int i = 0; i < len_mults; i++) { + if (i == 0) { + block_in = ch; + } else { + block_in = ch * ch_mult[i - 1]; + } + int block_out = ch * ch_mult[i]; + for (int j = 0; j < num_res_blocks; j++) { + down_blocks[i][j].in_channels = block_in; + down_blocks[i][j].out_channels = block_out; + block_in = block_out; + } + if (i != len_mults - 1) { + down_samples[i].channels 
= block_in; + down_samples[i].out_channels = block_in; + down_samples[i].vae_downsample = true; + } + } + + mid.block_1.in_channels = block_in; + mid.block_1.out_channels = block_in; + mid.attn_1.in_channels = block_in; + mid.block_2.in_channels = block_in; + mid.block_2.out_channels = block_in; + } + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + int len_mults = sizeof(ch_mult) / sizeof(int); + int block_in = ch * ch_mult[len_mults - 1]; + + mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w + mem_size += ch * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b + + mem_size += 2 * block_in * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b + + mem_size += z_channels * 2 * block_in * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w + mem_size += z_channels * 2 * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b + + mem_size += 6 * ggml_tensor_overhead(); // object overhead + + mem_size += mid.block_1.compute_params_mem_size(wtype); + mem_size += mid.attn_1.compute_params_mem_size(wtype); + mem_size += mid.block_2.compute_params_mem_size(wtype); + + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + mem_size += down_blocks[i][j].compute_params_mem_size(wtype); + } + if (i != 0) { + mem_size += down_samples[i - 1].compute_params_mem_size(wtype); + } + } + + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + int len_mults = sizeof(ch_mult) / sizeof(int); + int block_in = ch * ch_mult[len_mults - 1]; + + conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch); + conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch); + + norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in); + norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in); + + conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, block_in, z_channels * 2); + conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels * 2); + + mid.block_1.init_params(ctx, wtype); + mid.attn_1.init_params(ctx, wtype); + mid.block_2.init_params(ctx, wtype); + + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + down_blocks[i][j].init_params(ctx, wtype); + } + if (i != len_mults - 1) { + down_samples[i].init_params(ctx, wtype); + } + } + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "norm_out.weight"] = norm_out_w; + tensors[prefix + "norm_out.bias"] = norm_out_b; + tensors[prefix + "conv_in.weight"] = conv_in_w; + tensors[prefix + "conv_in.bias"] = conv_in_b; + tensors[prefix + "conv_out.weight"] = conv_out_w; + tensors[prefix + "conv_out.bias"] = conv_out_b; + + mid.block_1.map_by_name(tensors, prefix + "mid.block_1."); + mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1."); + mid.block_2.map_by_name(tensors, prefix + "mid.block_2."); + + int len_mults = sizeof(ch_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + down_blocks[i][j].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".block." + std::to_string(j) + "."); + } + if (i != len_mults - 1) { + down_samples[i].map_by_name(tensors, prefix + "down." 
+ std::to_string(i) + ".downsample."); + } + } + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + // x: [N, in_channels, h, w] + + // conv_in + auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1), + h)); // [N, ch, h, w] + int len_mults = sizeof(ch_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + for (int j = 0; j < num_res_blocks; j++) { + h = down_blocks[i][j].forward(ctx, h); + } + if (i != len_mults - 1) { + h = down_samples[i].forward(ctx, h); + } + } + + h = mid.block_1.forward(ctx, h); + h = mid.attn_1.forward(ctx, h); + h = mid.block_2.forward(ctx, h); // [N, block_in, h, w] + + // group norm 32 + h = ggml_group_norm_32(ctx, h); + h = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h), + ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h)); + + // silu + // silu + h = ggml_silu_inplace(ctx, h); + + // conv_out + h = ggml_conv_2d(ctx, conv_out_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv_out_b, 1, 1, conv_out_b->ne[0], 1), + h)); // [N, z_channels*2, h, w] + + return h; + } +}; + +// ldm.modules.diffusionmodules.model.Decoder +struct Decoder { + int embed_dim = 4; + int ch = 128; + int z_channels = 4; + int out_ch = 3; + int num_res_blocks = 2; + int ch_mult[4] = {1, 2, 4, 4}; + + // block_in = ch * ch_mult[-1], 512 + struct ggml_tensor* conv_in_w; // [block_in, z_channels, 3, 3] + struct ggml_tensor* conv_in_b; // [block_in, ] + + struct + { + ResnetBlock block_1; + AttnBlock attn_1; + ResnetBlock block_2; + } mid; + + ResnetBlock up_blocks[4][3]; + UpSample up_samples[3]; + + struct ggml_tensor* norm_out_w; // [ch * ch_mult[0], ] + struct ggml_tensor* norm_out_b; // [ch * ch_mult[0], ] + + struct ggml_tensor* conv_out_w; // [out_ch, ch * ch_mult[0], 3, 3] + struct ggml_tensor* conv_out_b; // [out_ch, ] + + Decoder() { + int len_mults = sizeof(ch_mult) / sizeof(int); + int block_in = ch * ch_mult[len_mults - 1]; + + mid.block_1.in_channels = block_in; + mid.block_1.out_channels = block_in; + mid.attn_1.in_channels = block_in; + mid.block_2.in_channels = block_in; + mid.block_2.out_channels = block_in; + + for (int i = len_mults - 1; i >= 0; i--) { + int mult = ch_mult[i]; + int block_out = ch * mult; + for (int j = 0; j < num_res_blocks + 1; j++) { + up_blocks[i][j].in_channels = block_in; + up_blocks[i][j].out_channels = block_out; + block_in = block_out; + } + if (i != 0) { + up_samples[i - 1].channels = block_in; + up_samples[i - 1].out_channels = block_in; + } + } + } + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + int len_mults = sizeof(ch_mult) / sizeof(int); + int block_in = ch * ch_mult[len_mults - 1]; + + mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w + mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b + + mem_size += 2 * (ch * ch_mult[0]) * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b + + mem_size += (ch * ch_mult[0]) * out_ch * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w + mem_size += out_ch * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b + + mem_size += 8 * ggml_tensor_overhead(); // object overhead + + mem_size += mid.block_1.compute_params_mem_size(wtype); + mem_size += mid.attn_1.compute_params_mem_size(wtype); + mem_size += 
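+
+// [editor's note] Encoder::forward above ends with z_channels*2 output channels:
+// the mean and log-variance of a diagonal Gaussian over the latent. Sampling
+// from those moments happens in the caller; the usual ldm formulation
+// (an assumption here, not code from this file) is, per element:
+static float sample_latent_ref(float mean, float logvar, float eps /* ~N(0,1) */) {
+    return mean + expf(0.5f * logvar) * eps;  // result is then multiplied by scale_factor
+}
+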
mid.block_2.compute_params_mem_size(wtype); + + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + mem_size += up_blocks[i][j].compute_params_mem_size(wtype); + } + if (i != 0) { + mem_size += up_samples[i - 1].compute_params_mem_size(wtype); + } + } + + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + int len_mults = sizeof(ch_mult) / sizeof(int); + int block_in = ch * ch_mult[len_mults - 1]; + + norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]); + norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]); + + conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, block_in); + conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in); + + conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, ch * ch_mult[0], out_ch); + conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch); + + mid.block_1.init_params(ctx, wtype); + mid.attn_1.init_params(ctx, wtype); + mid.block_2.init_params(ctx, wtype); + + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + up_blocks[i][j].init_params(ctx, wtype); + } + if (i != 0) { + up_samples[i - 1].init_params(ctx, wtype); + } + } + } + + void map_by_name(std::map& tensors, const std::string prefix) { + tensors[prefix + "norm_out.weight"] = norm_out_w; + tensors[prefix + "norm_out.bias"] = norm_out_b; + tensors[prefix + "conv_in.weight"] = conv_in_w; + tensors[prefix + "conv_in.bias"] = conv_in_b; + tensors[prefix + "conv_out.weight"] = conv_out_w; + tensors[prefix + "conv_out.bias"] = conv_out_b; + + mid.block_1.map_by_name(tensors, prefix + "mid.block_1."); + mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1."); + mid.block_2.map_by_name(tensors, prefix + "mid.block_2."); + + int len_mults = sizeof(ch_mult) / sizeof(int); + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + up_blocks[i][j].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".block." + std::to_string(j) + "."); + } + if (i != 0) { + up_samples[i - 1].map_by_name(tensors, prefix + "up." 
+ std::to_string(i) + ".upsample."); + } + } + } + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { + // z: [N, z_channels, h, w] + + // conv_in + auto h = ggml_conv_2d(ctx, conv_in_w, z, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1), + h)); // [N, block_in, h, w] + + h = mid.block_1.forward(ctx, h); + h = mid.attn_1.forward(ctx, h); + h = mid.block_2.forward(ctx, h); // [N, block_in, h, w] + + int len_mults = sizeof(ch_mult) / sizeof(int); + for (int i = len_mults - 1; i >= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + h = up_blocks[i][j].forward(ctx, h); + } + if (i != 0) { + h = up_samples[i - 1].forward(ctx, h); + } + } + + // group norm 32 + h = ggml_group_norm_32(ctx, h); + h = ggml_add(ctx, + ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h), + ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h)); + + // silu + // silu + h = ggml_silu_inplace(ctx, h); + + // conv_out + h = ggml_conv_2d(ctx, conv_out_w, h, 1, 1, 1, 1, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, conv_out_b, 1, 1, conv_out_b->ne[0], 1), + h)); // [N, out_ch, h, w] + + return h; + } +}; + +// ldm.models.autoencoder.AutoencoderKL +struct AutoEncoderKL { + bool decode_only = true; + int embed_dim = 4; + struct + { + int z_channels = 4; + int resolution = 256; + int in_channels = 3; + int out_ch = 3; + int ch = 128; + int ch_mult[4] = {1, 2, 4, 4}; + int num_res_blocks = 2; + } dd_config; + + struct ggml_tensor* quant_conv_w; // [2*embed_dim, 2*z_channels, 1, 1] + struct ggml_tensor* quant_conv_b; // [2*embed_dim, ] + + struct ggml_tensor* post_quant_conv_w; // [z_channels, embed_dim, 1, 1] + struct ggml_tensor* post_quant_conv_b; // [z_channels, ] + + Encoder encoder; + Decoder decoder; + + AutoEncoderKL(bool decode_only = false) + : decode_only(decode_only) { + assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult)); + assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult)); + + encoder.embed_dim = embed_dim; + decoder.embed_dim = embed_dim; + encoder.ch = dd_config.ch; + decoder.ch = dd_config.ch; + encoder.z_channels = dd_config.z_channels; + decoder.z_channels = dd_config.z_channels; + encoder.in_channels = dd_config.in_channels; + decoder.out_ch = dd_config.out_ch; + encoder.num_res_blocks = dd_config.num_res_blocks; + + int len_mults = sizeof(dd_config.ch_mult) / sizeof(int); + for (int i = 0; i < len_mults; i++) { + encoder.ch_mult[i] = dd_config.ch_mult[i]; + decoder.ch_mult[i] = dd_config.ch_mult[i]; + } + } + + size_t compute_params_mem_size(ggml_type wtype) { + double mem_size = 0; + + if (!decode_only) { + mem_size += 2 * embed_dim * 2 * dd_config.z_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // quant_conv_w + mem_size += 2 * embed_dim * ggml_type_sizef(GGML_TYPE_F32); // quant_conv_b + mem_size += encoder.compute_params_mem_size(wtype); + } + + mem_size += dd_config.z_channels * embed_dim * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // post_quant_conv_w + mem_size += dd_config.z_channels * ggml_type_sizef(GGML_TYPE_F32); // post_quant_conv_b + + mem_size += decoder.compute_params_mem_size(wtype); + return static_cast(mem_size); + } + + void init_params(struct ggml_context* ctx, ggml_type wtype) { + if (!decode_only) { + quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, 2 * dd_config.z_channels, 2 * embed_dim); + quant_conv_b = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, 2 * embed_dim); + encoder.init_params(ctx, wtype); + } + + post_quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, embed_dim, dd_config.z_channels); + post_quant_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dd_config.z_channels); + decoder.init_params(ctx, wtype); + } + + void map_by_name(std::map& tensors, const std::string prefix) { + if (!decode_only) { + tensors[prefix + "quant_conv.weight"] = quant_conv_w; + tensors[prefix + "quant_conv.bias"] = quant_conv_b; + encoder.map_by_name(tensors, prefix + "encoder."); + } + + tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w; + tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b; + decoder.map_by_name(tensors, prefix + "decoder."); + } + + struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { + // z: [N, z_channels, h, w] + + // post_quant_conv + auto h = ggml_conv_2d(ctx, post_quant_conv_w, z, 1, 1, 0, 0, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1), + h)); // [N, z_channels, h, w] + h = decoder.forward(ctx, h); + return h; + } + + struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) { + // x: [N, in_channels, h, w] + auto h = encoder.forward(ctx, x); // [N, 2*z_channels, h/8, w/8] + // quant_conv + h = ggml_conv_2d(ctx, quant_conv_w, h, 1, 1, 0, 0, 1, 1); + h = ggml_add(ctx, + h, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, quant_conv_b, 1, 1, quant_conv_b->ne[0], 1), + h)); // [N, 2*embed_dim, h/8, w/8] + return h; + } +}; + +/*================================================= CompVisDenoiser ==================================================*/ + +// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py + +struct SigmaSchedule { + float alphas_cumprod[TIMESTEPS]; + float sigmas[TIMESTEPS]; + float log_sigmas[TIMESTEPS]; + + virtual std::vector get_sigmas(uint32_t n) = 0; + + float sigma_to_t(float sigma) { + float log_sigma = std::log(sigma); + std::vector dists; + dists.reserve(TIMESTEPS); + for (float log_sigma_val : log_sigmas) { + dists.push_back(log_sigma - log_sigma_val); + } + + int low_idx = 0; + for (size_t i = 0; i < TIMESTEPS; i++) { + if (dists[i] >= 0) { + low_idx++; + } + } + low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2); + int high_idx = low_idx + 1; + + float low = log_sigmas[low_idx]; + float high = log_sigmas[high_idx]; + float w = (low - log_sigma) / (low - high); + w = std::max(0.f, std::min(1.f, w)); + float t = (1.0f - w) * low_idx + w * high_idx; + + return t; + } + + float t_to_sigma(float t) { + int low_idx = static_cast(std::floor(t)); + int high_idx = static_cast(std::ceil(t)); + float w = t - static_cast(low_idx); + float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx]; + return std::exp(log_sigma); + } +}; + +struct DiscreteSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n) { + std::vector result; + + int t_max = TIMESTEPS - 1; + + if (n == 0) { + return result; + } else if (n == 1) { + result.push_back(t_to_sigma(t_max)); + result.push_back(0); + return result; + } + + float step = static_cast(t_max) / static_cast(n - 1); + for (int i = 0; i < n; ++i) { + float t = t_max - step * i; + result.push_back(t_to_sigma(t)); + } + result.push_back(0); + return result; + } +}; + +struct KarrasSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n) { + // These *COULD* be function arguments here, + // but does anybody ever bother to touch them? 
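+
+// [editor's note] The loop below evaluates Eq. (5) of Karras et al. 2022:
+//   sigma_i = (sigma_max^(1/rho) + (i/(n-1)) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho
+// for i = 0..n-1, with sigma_n = 0 appended. rho = 7 concentrates the steps
+// toward small sigmas, where most of the perceptual detail is resolved.
+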
+ float sigma_min = 0.1; + float sigma_max = 10.; + float rho = 7.; + + std::vector result(n + 1); + + float min_inv_rho = pow(sigma_min, (1. / rho)); + float max_inv_rho = pow(sigma_max, (1. / rho)); + for (int i = 0; i < n; i++) { + // Eq. (5) from Karras et al 2022 + result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.) * (min_inv_rho - max_inv_rho), rho); + } + result[n] = 0.; + return result; + } +}; + +struct Denoiser { + std::shared_ptr schedule = std::make_shared(); + virtual std::vector get_scalings(float sigma) = 0; +}; + +struct CompVisDenoiser : public Denoiser { + float sigma_data = 1.0f; + + std::vector get_scalings(float sigma) { + float c_out = -sigma; + float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); + return {c_out, c_in}; + } +}; + +struct CompVisVDenoiser : public Denoiser { + float sigma_data = 1.0f; + + std::vector get_scalings(float sigma) { + float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data); + float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data); + float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); + return {c_skip, c_out, c_in}; + } +}; + +/*=============================================== StableDiffusionGGML ================================================*/ + +class StableDiffusionGGML { + public: + ggml_context* clip_params_ctx = NULL; + ggml_context* unet_params_ctx = NULL; + ggml_context* vae_params_ctx = NULL; + + bool dynamic = true; + bool vae_decode_only = false; + bool free_params_immediately = false; + + std::shared_ptr rng = std::make_shared(); + int32_t ftype = 1; + int n_threads = -1; + float scale_factor = 0.18215f; + size_t max_mem_size = 0; + size_t curr_params_mem_size = 0; + size_t max_params_mem_size = 0; + size_t max_rt_mem_size = 0; + + FrozenCLIPEmbedderWithCustomWords cond_stage_model; + UNetModel diffusion_model; + AutoEncoderKL first_stage_model; + + std::shared_ptr denoiser = std::make_shared(); + + StableDiffusionGGML() = default; + + StableDiffusionGGML(int n_threads, + bool vae_decode_only, + bool free_params_immediately, + RNGType rng_type) + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + free_params_immediately(free_params_immediately) { + first_stage_model.decode_only = vae_decode_only; + if (rng_type == STD_DEFAULT_RNG) { + rng = std::make_shared(); + } else if (rng_type == CUDA_RNG) { + rng = std::make_shared(); + } + } + + ~StableDiffusionGGML() { + if (clip_params_ctx != NULL) { + ggml_free(clip_params_ctx); + clip_params_ctx = NULL; + } + if (unet_params_ctx != NULL) { + ggml_free(unet_params_ctx); + unet_params_ctx = NULL; + } + if (vae_params_ctx != NULL) { + ggml_free(vae_params_ctx); + vae_params_ctx = NULL; + } + } + + bool load_from_file(const std::string& file_path, Schedule schedule) { + LOG_INFO("loading model from '%s'", file_path.c_str()); + + std::ifstream file(file_path, std::ios::binary); + if (!file.is_open()) { + LOG_ERROR("failed to open '%s'", file_path.c_str()); + return false; + } + + LOG_DEBUG("verifying magic"); + // verify magic + { + uint32_t magic; + file.read(reinterpret_cast(&magic), sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + LOG_ERROR("invalid model file '%s' (bad magic)", file_path.c_str()); + return false; + } + } + + LOG_DEBUG("loading hparams"); + // load hparams + file.read(reinterpret_cast(&ftype), sizeof(ftype)); + + int model_type = (ftype >> 16) & 0xFFFF; + if (model_type >= MODEL_TYPE_COUNT) { + LOG_ERROR("invalid model file '%s' (bad model type value 
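+
+// [editor's note] How the get_scalings() values above are consumed (see the
+// denoise lambda in sample() further down): the model is evaluated on c_in * x,
+// and the denoised estimate is reassembled, per k-diffusion convention, as
+//   denoised = c_skip * x + c_out * model(c_in * x, t)
+// with c_skip = 1 for the eps-parameterization (CompVisDenoiser). Scalar sketch:
+static float denoised_ref(float x, float model_out, float c_skip, float c_out) {
+    return c_skip * x + c_out * model_out;
+}
+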
%d)", file_path.c_str(), ftype); + return false; + } + LOG_INFO("model type: %s", model_type_to_str[model_type]); + + if (model_type == SD2) { + cond_stage_model = FrozenCLIPEmbedderWithCustomWords((ModelType)model_type); + diffusion_model = UNetModel((ModelType)model_type); + } + + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(ftype & 0xFFFF)); + LOG_INFO("ftype: %s", ggml_type_name(wtype)); + if (wtype == GGML_TYPE_COUNT) { + LOG_ERROR("invalid model file '%s' (bad ftype value %d)", file_path.c_str(), ftype); + return false; + } + + LOG_DEBUG("loading vocab"); + // load vocab + { + int32_t n_vocab = 0; + file.read(reinterpret_cast(&n_vocab), sizeof(n_vocab)); + + if (n_vocab != cond_stage_model.text_model.vocab_size) { + LOG_ERROR("invalid model file '%s' (bad vocab size %d != %d)", + file_path.c_str(), n_vocab, cond_stage_model.text_model.vocab_size); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + file.read((char*)&len, sizeof(len)); + + buf.resize(len); + file.read((char*)buf.data(), len); + word.assign(buf.data(), len); + + cond_stage_model.tokenizer.add_token(word, i); + } + } + + // create the ggml context for network params + LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); + { + // cond_stage_model(FrozenCLIPEmbedder) + double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding + ctx_size += cond_stage_model.text_model.compute_params_mem_size(wtype); + LOG_DEBUG("clip params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0)); + + struct ggml_init_params params; + params.mem_size = static_cast(ctx_size); + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; + + clip_params_ctx = ggml_init(params); + if (!clip_params_ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + } + + { + // diffusion_model(UNetModel) + double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding + ctx_size += diffusion_model.compute_params_mem_size(wtype); + LOG_DEBUG("unet params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0)); + + struct ggml_init_params params; + params.mem_size = static_cast(ctx_size); + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; + + unet_params_ctx = ggml_init(params); + if (!unet_params_ctx) { + LOG_ERROR("ggml_init() failed"); + ggml_free(clip_params_ctx); + clip_params_ctx = NULL; + return false; + } + } + + { + // first_stage_model(AutoEncoderKL) + double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding + ctx_size += first_stage_model.compute_params_mem_size(wtype); + LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0)); + + struct ggml_init_params params; + params.mem_size = static_cast(ctx_size); + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; + + vae_params_ctx = ggml_init(params); + if (!vae_params_ctx) { + LOG_ERROR("ggml_init() failed"); + ggml_free(clip_params_ctx); + clip_params_ctx = NULL; + ggml_free(unet_params_ctx); + unet_params_ctx = NULL; + return false; + } + } + + std::map tensors; + + LOG_DEBUG("preparing memory for the weights"); + // prepare memory for the weights + { + // cond_stage_model(FrozenCLIPEmbedder) + cond_stage_model.text_model.init_params(clip_params_ctx, wtype); + cond_stage_model.text_model.map_by_name(tensors, "cond_stage_model.transformer.text_model."); + + // diffusion_model(UNetModel) + diffusion_model.init_params(unet_params_ctx, wtype); + diffusion_model.map_by_name(tensors, "model.diffusion_model."); + + // 
firest_stage_model(AutoEncoderKL) + first_stage_model.init_params(vae_params_ctx, wtype); + first_stage_model.map_by_name(tensors, "first_stage_model."); + } + + LOG_DEBUG("loading weights"); + std::set tensor_names_in_file; + int64_t t0 = ggml_time_ms(); + // load weights + float alphas_cumprod[TIMESTEPS]; + { + int n_tensors = 0; + size_t total_size = 0; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + file.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + file.read(reinterpret_cast(&length), sizeof(length)); + file.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (file.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = {1, 1, 1, 1}; + for (int i = 0; i < n_dims; ++i) { + file.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + const size_t num_bytes = nelements / ggml_blck_size(ggml_type(ttype)) * ggml_type_size(ggml_type(ttype)); + + std::string name(length, 0); + file.read(&name[0], length); + + tensor_names_in_file.insert(std::string(name.data())); + + if (std::string(name.data()) == "alphas_cumprod") { + file.read(reinterpret_cast(alphas_cumprod), nelements * ggml_type_size((ggml_type)ttype)); + continue; + } + + struct ggml_tensor* tensor; + if (tensors.find(name.data()) != tensors.end()) { + tensor = tensors[name.data()]; + } else { + if (name.find("quant") == std::string::npos && name.find("first_stage_model.encoder.") == std::string::npos) { + LOG_WARN("unknown tensor '%s' in model file", name.data()); + } else { + if (!vae_decode_only) { + LOG_WARN("unknown tensor '%s' in model file", name.data()); + return false; + } + } + file.ignore(num_bytes); + continue; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) { + LOG_ERROR( + "tensor '%s' has wrong shape in model file: " + "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.data(), + ne[0], ne[1], ne[2], ne[3], + (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]); + return false; + } + + if (ggml_nelements(tensor) != nelements) { + LOG_ERROR( + "tensor '%s' has wrong number of elements in model file: " + "got %u, expert %zu", + name.data(), nelements, ggml_nelements(tensor)); + return false; + } + + if (tensor->type != ttype) { + LOG_ERROR("tensor '%s' has wrong type in model file: got %s, expect %s", + name.data(), ggml_type_name(ggml_type(ttype)), ggml_type_name(tensor->type)); + return false; + } + + file.read(reinterpret_cast(tensor->data), num_bytes); + + total_size += ggml_nbytes(tensor); + } + bool some_tensor_not_init = false; + for (auto pair : tensors) { + if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) { + continue; + } + if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) { + LOG_ERROR("tensor '%s' not in model file", pair.first.c_str()); + some_tensor_not_init = true; + } + } + if (tensor_names_in_file.find("alphas_cumprod") == tensor_names_in_file.end()) { + LOG_ERROR("tensor alphas_cumprod not in model file"); + some_tensor_not_init = true; + } + if (some_tensor_not_init) { + file.close(); + return false; + } + LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); + } + max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx); + max_mem_size = max_params_mem_size; + curr_params_mem_size = max_params_mem_size; + LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)", + 
max_params_mem_size / 1024.0 / 1024.0, + ggml_used_mem(clip_params_ctx) / 1024.0 / 1024.0, + ggml_used_mem(unet_params_ctx) / 1024.0 / 1024.0, + ggml_used_mem(vae_params_ctx) / 1024.0 / 1024.0); + int64_t t1 = ggml_time_ms(); + LOG_INFO("loading model from '%s' completed, taking %.2fs", file_path.c_str(), (t1 - t0) * 1.0f / 1000); + file.close(); + + // check is_using_v_parameterization_for_sd2 + bool is_using_v_parameterization = false; + if (model_type == SD2) { + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + if (is_using_v_parameterization_for_sd2(ctx)) { + is_using_v_parameterization = true; + } + } + + if (is_using_v_parameterization) { + denoiser = std::make_shared(); + LOG_INFO("running in v-prediction mode"); + } else { + LOG_INFO("running in eps-prediction mode"); + } + + if (schedule != DEFAULT) { + switch (schedule) { + case DISCRETE: + LOG_INFO("running with discrete schedule"); + denoiser->schedule = std::make_shared(); + break; + case KARRAS: + LOG_INFO("running with Karras schedule"); + denoiser->schedule = std::make_shared(); + break; + case DEFAULT: + // Don't touch anything. + break; + default: + LOG_ERROR("Unknown schedule %i", schedule); + abort(); + } + } + + for (int i = 0; i < TIMESTEPS; i++) { + denoiser->schedule->alphas_cumprod[i] = alphas_cumprod[i]; + denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); + } + + return true; + } + + bool is_using_v_parameterization_for_sd2(ggml_context* res_ctx) { + struct ggml_tensor* x_t = ggml_new_tensor_4d(res_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + ggml_set_f32(x_t, 0.5); + struct ggml_tensor* c = ggml_new_tensor_4d(res_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + ggml_set_f32(c, 0.5); + + struct ggml_cplan cplan; + + size_t ctx_size = 10 * 1024 * 1024; // 10MB + // calculate the amount of memory required + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = true; + params.dynamic = dynamic; + + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + + ggml_set_dynamic(ctx, false); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] + ggml_set_dynamic(ctx, params.dynamic); + + struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb); + ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx); + + struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out); + cplan = ggml_graph_plan(diffusion_graph, n_threads); + + ctx_size += cplan.work_size; + LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB", + ctx_size * 1.0f / 1024 / 1024, + cplan.work_size * 1.0f / 1024 / 1024); + + ggml_free(ctx); + } + + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = dynamic; + + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + + ggml_set_dynamic(ctx, false); + struct 
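+
+// [editor's note] The conversion applied to alphas_cumprod at the end of
+// load_from_file above: each DDPM alpha-bar becomes a k-diffusion sigma via
+//   sigma_t = sqrt((1 - alphabar_t) / alphabar_t),  log_sigma_t = log(sigma_t)
+// Scalar restatement:
+static float sigma_from_alphabar(float alphabar) {
+    return sqrtf((1.0f - alphabar) / alphabar);
+}
+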
ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] + ggml_set_dynamic(ctx, params.dynamic); + ggml_set_f32(timesteps, 999); + set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels); + + struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb); + ggml_hold_dynamic_tensor(out); + + struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out); + cplan = ggml_graph_plan(diffusion_graph, n_threads); + + ggml_set_dynamic(ctx, false); + struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + ggml_set_dynamic(ctx, params.dynamic); + + cplan.work_data = (uint8_t*)buf->data; + + int64_t t0 = ggml_time_ms(); + ggml_graph_compute(diffusion_graph, &cplan); + + double result = 0.f; + + { + float* vec_x = (float*)x_t->data; + float* vec_out = (float*)out->data; + + int64_t n = ggml_nelements(out); + + for (int i = 0; i < n; i++) { + result += ((double)vec_out[i] - (double)vec_x[i]); + } + result /= n; + } + +#ifdef GGML_PERF + ggml_graph_print(&diffusion_graph); +#endif + int64_t t1 = ggml_time_ms(); + LOG_INFO("check is_using_v_parameterization_for_sd2 completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB", + (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024, + ctx_size * 1.0f / 1024 / 1024, + ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024); + LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size()); + + return result < -1; + } + + ggml_tensor* get_learned_condition(ggml_context* res_ctx, const std::string& text) { + auto tokens_and_weights = cond_stage_model.tokenize(text, + cond_stage_model.text_model.max_position_embeddings, + true); + std::vector& tokens = tokens_and_weights.first; + std::vector& weights = tokens_and_weights.second; + struct ggml_cplan cplan; + size_t ctx_size = 10 * 1024 * 1024; // 10MB + // calculate the amount of memory required + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = true; + params.dynamic = dynamic; + + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return NULL; + } + + ggml_set_dynamic(ctx, false); + struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size()); + ggml_set_dynamic(ctx, params.dynamic); + + struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids); + + struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states); + cplan = ggml_graph_plan(cond_graph, n_threads); + ctx_size += cplan.work_size; + + ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx); + LOG_DEBUG("condition context need %.2fMB static memory, with work_size needing %.2fMB", + ctx_size * 1.0f / 1024 / 1024, + cplan.work_size * 1.0f / 1024 / 1024); + ggml_free(ctx); + } + + // allocate the required memory and compute forward + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = dynamic; + + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return NULL; + } + + ggml_set_dynamic(ctx, false); + struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size()); + ggml_set_dynamic(ctx, params.dynamic); + + 
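+
+// [editor's note] After the forward pass just below, each token's embedding is
+// multiplied by its prompt weight, then the whole tensor is rescaled so its
+// mean matches the unweighted mean (the usual prompt-emphasis trick). Flat-array
+// sketch of that post-processing (hypothetical helper):
+static void apply_token_weights_ref(float* v, const float* w, int n_tok, int hidden) {
+    double old_sum = 0.0, new_sum = 0.0;
+    for (int i = 0; i < n_tok * hidden; i++) old_sum += v[i];
+    for (int t = 0; t < n_tok; t++)
+        for (int i = 0; i < hidden; i++) v[t * hidden + i] *= w[t];
+    for (int i = 0; i < n_tok * hidden; i++) new_sum += v[i];
+    for (int i = 0; i < n_tok * hidden; i++) v[i] *= (float)(old_sum / new_sum);
+}
+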
+        return result < -1;
+    }
+
+    ggml_tensor* get_learned_condition(ggml_context* res_ctx, const std::string& text) {
+        auto tokens_and_weights = cond_stage_model.tokenize(text,
+                                                            cond_stage_model.text_model.max_position_embeddings,
+                                                            true);
+        std::vector<int>& tokens = tokens_and_weights.first;
+        std::vector<float>& weights = tokens_and_weights.second;
+        struct ggml_cplan cplan;
+        size_t ctx_size = 10 * 1024 * 1024;  // 10MB
+        // calculate the amount of memory required
+        {
+            struct ggml_init_params params;
+            params.mem_size = ctx_size;
+            params.mem_buffer = NULL;
+            params.no_alloc = true;
+            params.dynamic = dynamic;
+
+            struct ggml_context* ctx = ggml_init(params);
+            if (!ctx) {
+                LOG_ERROR("ggml_init() failed");
+                return NULL;
+            }
+
+            ggml_set_dynamic(ctx, false);
+            struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
+            ggml_set_dynamic(ctx, params.dynamic);
+
+            struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
+
+            struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
+            cplan = ggml_graph_plan(cond_graph, n_threads);
+            ctx_size += cplan.work_size;
+
+            ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
+            LOG_DEBUG("condition context needs %.2fMB static memory, with work_size needing %.2fMB",
+                      ctx_size * 1.0f / 1024 / 1024,
+                      cplan.work_size * 1.0f / 1024 / 1024);
+            ggml_free(ctx);
+        }
+
+        // allocate the required memory and compute forward
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
+        params.dynamic = dynamic;
+
+        struct ggml_context* ctx = ggml_init(params);
+        if (!ctx) {
+            LOG_ERROR("ggml_init() failed");
+            return NULL;
+        }
+
+        ggml_set_dynamic(ctx, false);
+        struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
+        ggml_set_dynamic(ctx, params.dynamic);
+
+        struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
+        struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
+        LOG_DEBUG("building condition graph completed: %d nodes, %d leafs",
+                  cond_graph->n_nodes, cond_graph->n_leafs);
+
+        memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
+
+        int64_t t0 = ggml_time_ms();
+        ggml_graph_compute_with_ctx(ctx, cond_graph, n_threads);
+        int64_t t1 = ggml_time_ms();
+        LOG_DEBUG("computing condition graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+
+        ggml_tensor* result = ggml_dup_tensor(res_ctx, hidden_states);  // [N, n_token, hidden_size]
+
+        {
+            int64_t nelements = ggml_nelements(hidden_states);
+            float original_mean = 0.f;
+            float new_mean = 0.f;
+            float* vec = (float*)hidden_states->data;
+            for (int i = 0; i < nelements; i++) {
+                original_mean += vec[i] / nelements * 1.0f;
+            }
+
+            for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
+                for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
+                        float value = ggml_tensor_get_f32(hidden_states, i0, i1, i2);
+                        value *= weights[i1];
+                        ggml_tensor_set_f32(result, value, i0, i1, i2);
+                    }
+                }
+            }
+
+            vec = (float*)result->data;
+            for (int i = 0; i < nelements; i++) {
+                new_mean += vec[i] / nelements * 1.0f;
+            }
+
+            for (int i = 0; i < nelements; i++) {
+                vec[i] = vec[i] * (original_mean / new_mean);
+            }
+        }
+
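+        // The block above implements simple prompt weighting: each token's
+        // hidden state is scaled by its parsed weight, then the whole tensor is
+        // rescaled so its mean matches the unweighted output -- roughly the same
+        // trick the AUTOMATIC1111 webui uses for emphasis syntax. E.g. a weight
+        // of 1.5 parsed from "(cat:1.5)"-style input scales that token's
+        // embedding by 1.5 before the mean is restored.
+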
+        // print_ggml_tensor(result);
+
+        size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+        if (rt_mem_size > max_rt_mem_size) {
+            max_rt_mem_size = rt_mem_size;
+        }
+        size_t graph_mem_size = ggml_used_mem(clip_params_ctx) + rt_mem_size;
+
+        size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+        if (curr_mem_size > max_mem_size) {
+            max_mem_size = curr_mem_size;
+        }
+
+        LOG_INFO(
+            "condition graph uses %.2fMB of memory: params %.2fMB, "
+            "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+            graph_mem_size * 1.0f / 1024 / 1024,
+            ggml_used_mem(clip_params_ctx) * 1.0f / 1024 / 1024,
+            rt_mem_size * 1.0f / 1024 / 1024,
+            ctx_size * 1.0f / 1024 / 1024,
+            ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+
+        LOG_DEBUG("%zu bytes of dynamic memory have not been released yet", ggml_dynamic_size());
+
+        ggml_free(ctx);
+
+        return result;  // [1, 77, 768]
+    }
+
+    ggml_tensor* sample(ggml_context* res_ctx,
+                        ggml_tensor* x_t,
+                        ggml_tensor* c,
+                        ggml_tensor* uc,
+                        float cfg_scale,
+                        SampleMethod method,
+                        const std::vector<float>& sigmas) {
+        size_t steps = sigmas.size() - 1;
+        // x_t = load_tensor_from_file(res_ctx, "./rand0.bin");
+        // print_ggml_tensor(x_t);
+        struct ggml_tensor* x = ggml_dup_tensor(res_ctx, x_t);
+        copy_ggml_tensor(x, x_t);
+        struct ggml_cplan cplan;
+
+        size_t ctx_size = 10 * 1024 * 1024;  // 10MB
+        // calculate the amount of memory required
+        {
+            struct ggml_init_params params;
+            params.mem_size = ctx_size;
+            params.mem_buffer = NULL;
+            params.no_alloc = true;
+            params.dynamic = dynamic;
+
+            struct ggml_context* ctx = ggml_init(params);
+            if (!ctx) {
+                LOG_ERROR("ggml_init() failed");
+                return NULL;
+            }
+
+            ggml_set_dynamic(ctx, false);
+            struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
+            struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
+            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
+            struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+            ggml_set_dynamic(ctx, params.dynamic);
+
+            struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
+            ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
+
+            struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
+            cplan = ggml_graph_plan(diffusion_graph, n_threads);
+
+            ctx_size += cplan.work_size;
+            LOG_DEBUG("diffusion context needs %.2fMB static memory, with work_size needing %.2fMB",
+                      ctx_size * 1.0f / 1024 / 1024,
+                      cplan.work_size * 1.0f / 1024 / 1024);
+
+            ggml_free(ctx);
+        }
+
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
+        params.dynamic = dynamic;
+
+        struct ggml_context* ctx = ggml_init(params);
+        if (!ctx) {
+            LOG_ERROR("ggml_init() failed");
+            return NULL;
+        }
+
+        ggml_set_dynamic(ctx, false);
+        struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
+        struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
+        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
+        struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        ggml_set_dynamic(ctx, params.dynamic);
+
+        struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
+        ggml_hold_dynamic_tensor(out);
+
+        struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
+        cplan = ggml_graph_plan(diffusion_graph, n_threads);
+
+        ggml_set_dynamic(ctx, false);
+        struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
+        ggml_set_dynamic(ctx, params.dynamic);
+
+        cplan.work_data = (uint8_t*)buf->data;
+
+        // x = x * sigmas[0]
+        {
+            float* vec = (float*)x->data;
+            for (int i = 0; i < ggml_nelements(x); i++) {
+                vec[i] = vec[i] * sigmas[0];
+            }
+        }
+
+        // denoise wrapper
+        ggml_set_dynamic(ctx, false);
+        struct ggml_tensor* out_cond = NULL;
+        struct ggml_tensor* out_uncond = NULL;
+        if (cfg_scale != 1.0f && uc != NULL) {
+            out_uncond = ggml_dup_tensor(ctx, x);
+        }
+        struct ggml_tensor* denoised = ggml_dup_tensor(ctx, x);
+        ggml_set_dynamic(ctx, params.dynamic);
+
+        auto denoise = [&](ggml_tensor* input, float sigma, int step) {
+            int64_t t0 = ggml_time_ms();
+
+            float c_skip = 1.0f;
+            float c_out = 1.0f;
+            float c_in = 1.0f;
+            std::vector<float> scaling = denoiser->get_scalings(sigma);
+            if (scaling.size() == 3) {  // CompVisVDenoiser
+                c_skip = scaling[0];
+                c_out = scaling[1];
+                c_in = scaling[2];
+            } else {  // CompVisDenoiser
+                c_out = scaling[0];
+                c_in = scaling[1];
+            }
+
+            float t = denoiser->schedule->sigma_to_t(sigma);
+            ggml_set_f32(timesteps, t);
+            set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
+
+            copy_ggml_tensor(noised_input, input);
+            // noised_input = noised_input * c_in
+            {
+                float* vec = (float*)noised_input->data;
+                for (int i = 0; i < ggml_nelements(noised_input); i++) {
+                    vec[i] = vec[i] * c_in;
+                }
+            }
+
+            if (cfg_scale != 1.0f && uc != NULL) {
+                // uncond
+                copy_ggml_tensor(context, uc);
+                ggml_graph_compute(diffusion_graph, &cplan);
+                copy_ggml_tensor(out_uncond, out);
+
+                // cond
+                copy_ggml_tensor(context, c);
+                ggml_graph_compute(diffusion_graph, &cplan);
+
+                out_cond = out;
+
+                // out_uncond + cfg_scale * (out_cond - out_uncond)
+                {
+                    float* vec_out = (float*)out->data;
+                    float* vec_out_uncond = (float*)out_uncond->data;
+                    float* vec_out_cond = (float*)out_cond->data;
+
+                    for (int i = 0; i < ggml_nelements(out); i++) {
+                        vec_out[i] = vec_out_uncond[i] + cfg_scale * (vec_out_cond[i] - vec_out_uncond[i]);
+                    }
+                }
+            } else {
+                // cond
+                copy_ggml_tensor(context, c);
+                ggml_graph_compute(diffusion_graph, &cplan);
+            }
+
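+            // Classifier-free guidance: the two UNet evaluations above are
+            // combined as out = out_uncond + cfg_scale * (out_cond - out_uncond),
+            // i.e. the conditional prediction is extrapolated away from the
+            // unconditional one. cfg_scale == 1 reduces to out_cond, which is
+            // why the uncond pass is skipped in the else branch.
+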
+            // v = out, eps = out
+            // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
+            {
+                float* vec_denoised = (float*)denoised->data;
+                float* vec_input = (float*)input->data;
+                float* vec_out = (float*)out->data;
+
+                for (int i = 0; i < ggml_nelements(denoised); i++) {
+                    vec_denoised[i] = vec_out[i] * c_out + vec_input[i] * c_skip;
+                }
+            }
+
+#ifdef GGML_PERF
+            ggml_graph_print(diffusion_graph);
+#endif
+            int64_t t1 = ggml_time_ms();
+            if (step > 0) {
+                LOG_INFO("step %d sampling completed, taking %.2fs", step, (t1 - t0) * 1.0f / 1000);
+                LOG_DEBUG("diffusion graph uses %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB",
+                          (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024,
+                          ctx_size * 1.0f / 1024 / 1024,
+                          ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+                LOG_DEBUG("%zu bytes of dynamic memory have not been released yet", ggml_dynamic_size());
+            }
+        };
+
+        // sample_euler_ancestral
+        switch (method) {
+            case EULER_A: {
+                LOG_INFO("sampling using Euler A method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                for (int i = 0; i < steps; i++) {
+                    float sigma = sigmas[i];
+
+                    // denoise
+                    denoise(x, sigma, i + 1);
+
+                    // d = (x - denoised) / sigma
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        for (int j = 0; j < ggml_nelements(d); j++) {
+                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
+                        }
+                    }
+
+                    // get_ancestral_step
+                    float sigma_up = std::min(sigmas[i + 1],
+                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
+
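+                    // get_ancestral_step splits the move from sigma_i to
+                    // sigma_{i+1} into a deterministic part and fresh noise:
+                    // sigma_down^2 + sigma_up^2 == sigma_{i+1}^2, so stepping to
+                    // sigma_down and then adding sigma_up * N(0, I) restores the
+                    // marginal noise level sigma_{i+1}.
+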
+                    // Euler method
+                    float dt = sigma_down - sigmas[i];
+                    // x = x + d * dt
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    }
+
+                    if (sigmas[i + 1] > 0) {
+                        // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+                        ggml_tensor_set_f32_randn(noise, rng);
+                        // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
+                        {
+                            float* vec_x = (float*)x->data;
+                            float* vec_noise = (float*)noise->data;
+
+                            for (int j = 0; j < ggml_nelements(x); j++) {
+                                vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
+                            }
+                        }
+                    }
+                }
+            } break;
+            case EULER:  // Implemented without any sigma churn
+            {
+                LOG_INFO("sampling using Euler method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                for (int i = 0; i < steps; i++) {
+                    float sigma = sigmas[i];
+
+                    // denoise
+                    denoise(x, sigma, i + 1);
+
+                    // d = (x - denoised) / sigma
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        for (int j = 0; j < ggml_nelements(d); j++) {
+                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
+                        }
+                    }
+
+                    float dt = sigmas[i + 1] - sigma;
+                    // x = x + d * dt
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    }
+                }
+            } break;
+            case HEUN: {
+                LOG_INFO("sampling using Heun method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                for (int i = 0; i < steps; i++) {
+                    // denoise (negative step suppresses the per-step log for the
+                    // first of Heun's two model evaluations)
+                    denoise(x, sigmas[i], -(i + 1));
+
+                    // d = (x - denoised) / sigma
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
+                        }
+                    }
+
+                    float dt = sigmas[i + 1] - sigmas[i];
+                    if (sigmas[i + 1] == 0) {
+                        // Euler step
+                        // x = x + d * dt
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    } else {
+                        // Heun step
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_x2 = (float*)x2->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x2[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+
+                        denoise(x2, sigmas[i + 1], i + 1);
+                        float* vec_denoised = (float*)denoised->data;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
+                            vec_d[j] = (vec_d[j] + d2) / 2;
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    }
+                }
+            } break;
+            case DPM2: {
+                LOG_INFO("sampling using DPM2 method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                for (int i = 0; i < steps; i++) {
+                    // denoise
+                    denoise(x, sigmas[i], i + 1);
+
+                    // d = (x - denoised) / sigma
+                    {
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
+                        }
+                    }
+
+                    if (sigmas[i + 1] == 0) {
+                        // Euler step
+                        // x = x + d * dt
+                        float dt = sigmas[i + 1] - sigmas[i];
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    } else {
+                        // DPM-Solver-2
+                        float sigma_mid = exp(0.5 * (log(sigmas[i]) + log(sigmas[i + 1])));
+                        float dt_1 = sigma_mid - sigmas[i];
+                        float dt_2 = sigmas[i + 1] - sigmas[i];
+
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_x2 = (float*)x2->data;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
+                        }
+
+                        denoise(x2, sigma_mid, i + 1);
+                        float* vec_denoised = (float*)denoised->data;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
+                            vec_x[j] = vec_x[j] + d2 * dt_2;
+                        }
+                    }
+                }
+
+            } break;
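+
+            // Note on DPM2 above: sigma_mid = exp(0.5 * (log s_i + log s_{i+1}))
+            // is the geometric mean sqrt(s_i * s_{i+1}); the method takes a half
+            // step to that midpoint, re-evaluates the model there, and uses the
+            // midpoint slope for the full step -- a second-order midpoint rule
+            // in log-sigma.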
+            case DPMPP2S_A: {
+                LOG_INFO("sampling using DPM++ (2s) a method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                for (int i = 0; i < steps; i++) {
+                    // denoise
+                    denoise(x, sigmas[i], i + 1);
+
+                    // get_ancestral_step
+                    float sigma_up = std::min(sigmas[i + 1],
+                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
+                    auto t_fn = [](float sigma) -> float { return -log(sigma); };
+                    auto sigma_fn = [](float t) -> float { return exp(-t); };
+
+                    if (sigma_down == 0) {
+                        // Euler step
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        for (int j = 0; j < ggml_nelements(d); j++) {
+                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
+                        }
+
+                        // TODO: If sigma_down == 0, isn't this wrong?
+                        // But
+                        // https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L525
+                        // has this exactly the same way.
+                        float dt = sigma_down - sigmas[i];
+                        for (int j = 0; j < ggml_nelements(d); j++) {
+                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                        }
+                    } else {
+                        // DPM-Solver++(2S)
+                        float t = t_fn(sigmas[i]);
+                        float t_next = t_fn(sigma_down);
+                        float h = t_next - t;
+                        float s = t + 0.5 * h;
+
+                        float* vec_d = (float*)d->data;
+                        float* vec_x = (float*)x->data;
+                        float* vec_x2 = (float*)x2->data;
+                        float* vec_denoised = (float*)denoised->data;
+
+                        // First half-step
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5) - 1) * vec_denoised[j];
+                        }
+
+                        denoise(x2, sigmas[i + 1], i + 1);
+
+                        // Second half-step
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j];
+                        }
+                    }
+
+                    // Noise addition
+                    if (sigmas[i + 1] > 0) {
+                        ggml_tensor_set_f32_randn(noise, rng);
+                        {
+                            float* vec_x = (float*)x->data;
+                            float* vec_noise = (float*)noise->data;
+
+                            for (int j = 0; j < ggml_nelements(x); j++) {
+                                vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
+                            }
+                        }
+                    }
+                }
+            } break;
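+
+            // Note on DPM++(2S) above: with t = -log(sigma) and h = t_next - t,
+            // the update x <- (sigma_next / sigma) * x - (exp(-h) - 1) * denoised
+            // rearranges to x <- denoised + (sigma_next / sigma) * (x - denoised),
+            // an exponential-integrator step that is exact when the denoised
+            // prediction stays constant over the step; the extra model call at
+            // the log-midpoint s is what makes it second order.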
+            case DPMPP2M:  // DPM++ (2M) from Karras et al (2022)
+            {
+                LOG_INFO("sampling using DPM++ (2M) method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                auto t_fn = [](float sigma) -> float { return -log(sigma); };
+
+                for (int i = 0; i < steps; i++) {
+                    // denoise
+                    denoise(x, sigmas[i], i + 1);
+
+                    float t = t_fn(sigmas[i]);
+                    float t_next = t_fn(sigmas[i + 1]);
+                    float h = t_next - t;
+                    float a = sigmas[i + 1] / sigmas[i];
+                    float b = exp(-h) - 1.;
+                    float* vec_x = (float*)x->data;
+                    float* vec_denoised = (float*)denoised->data;
+                    float* vec_old_denoised = (float*)old_denoised->data;
+
+                    if (i == 0 || sigmas[i + 1] == 0) {
+                        // Simpler step for the edge cases
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
+                        }
+                    } else {
+                        float h_last = t - t_fn(sigmas[i - 1]);
+                        float r = h_last / h;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. * r)) * vec_old_denoised[j];
+                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                        }
+                    }
+
+                    // old_denoised = denoised
+                    for (int j = 0; j < ggml_nelements(x); j++) {
+                        vec_old_denoised[j] = vec_denoised[j];
+                    }
+                }
+            } break;
+            case DPMPP2Mv2:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
+            {
+                LOG_INFO("sampling using modified DPM++ (2M) method");
+                ggml_set_dynamic(ctx, false);
+                struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x);
+                ggml_set_dynamic(ctx, params.dynamic);
+
+                auto t_fn = [](float sigma) -> float { return -log(sigma); };
+
+                for (int i = 0; i < steps; i++) {
+                    // denoise
+                    denoise(x, sigmas[i], i + 1);
+
+                    float t = t_fn(sigmas[i]);
+                    float t_next = t_fn(sigmas[i + 1]);
+                    float h = t_next - t;
+                    float a = sigmas[i + 1] / sigmas[i];
+                    float* vec_x = (float*)x->data;
+                    float* vec_denoised = (float*)denoised->data;
+                    float* vec_old_denoised = (float*)old_denoised->data;
+
+                    if (i == 0 || sigmas[i + 1] == 0) {
+                        // Simpler step for the edge cases
+                        float b = exp(-h) - 1.;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
+                        }
+                    } else {
+                        float h_last = t - t_fn(sigmas[i - 1]);
+                        float h_min = std::min(h_last, h);
+                        float h_max = std::max(h_last, h);
+                        float r = h_max / h_min;
+                        float h_d = (h_max + h_min) / 2.;
+                        float b = exp(-h_d) - 1.;
+                        for (int j = 0; j < ggml_nelements(x); j++) {
+                            float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. * r)) * vec_old_denoised[j];
+                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                        }
+                    }
+
+                    // old_denoised = denoised
+                    for (int j = 0; j < ggml_nelements(x); j++) {
+                        vec_old_denoised[j] = vec_denoised[j];
+                    }
+                }
+            } break;
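+
+            // The two multistep variants above reuse the previous model output
+            // instead of a second call per step: denoised_d = (1 + 1/(2r)) * D_i
+            // - 1/(2r) * D_{i-1} extrapolates the denoised trajectory, with
+            // r = h_last / h. DPMPP2Mv2 differs only in recomputing b from
+            // h_d = (h_max + h_min) / 2 per the linked webui discussion, which
+            // reportedly behaves better at low step counts.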
needing %.2fMB", + ctx_size * 1.0f / 1024 / 1024, + cplan.work_size * 1.0f / 1024 / 1024); + + ggml_free(ctx); + } + + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = dynamic; + + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return NULL; + } + + struct ggml_tensor* moments = first_stage_model.encode(ctx, x); + struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, moments); + + int64_t t0 = ggml_time_ms(); + ggml_graph_compute_with_ctx(ctx, vae_graph, n_threads); + int64_t t1 = ggml_time_ms(); + +#ifdef GGML_PERF + ggml_graph_print(&vae_graph); +#endif + LOG_DEBUG("computing vae graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + + result = ggml_dup_tensor(res_ctx, moments); + copy_ggml_tensor(result, moments); + + size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size(); + if (rt_mem_size > max_rt_mem_size) { + max_rt_mem_size = rt_mem_size; + } + size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size; + + size_t curr_mem_size = curr_params_mem_size + rt_mem_size; + if (curr_mem_size > max_mem_size) { + max_mem_size = curr_mem_size; + } + + LOG_INFO( + "vae graph use %.2fMB of memory: params %.2fMB, " + "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)", + graph_mem_size * 1.0f / 1024 / 1024, + ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024, + rt_mem_size * 1.0f / 1024 / 1024, + ctx_size * 1.0f / 1024 / 1024, + ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024); + LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size()); + + ggml_free(ctx); + } + + return result; + } + + // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding + ggml_tensor* get_first_stage_encoding(ggml_context* res_ctx, ggml_tensor* moments) { + // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample + ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0], + moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent); + ggml_tensor_set_f32_randn(noise, rng); + // noise = load_tensor_from_file(res_ctx, "noise.bin"); + { + float mean = 0; + float logvar = 0; + float value = 0; + float std_ = 0; + for (int i = 0; i < latent->ne[3]; i++) { + for (int j = 0; j < latent->ne[2]; j++) { + for (int k = 0; k < latent->ne[1]; k++) { + for (int l = 0; l < latent->ne[0]; l++) { + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + logvar = std::max(-30.0f, std::min(logvar, 20.0f)); + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; + // printf("%d %d %d %d -> %f\n", i, j, k, l, value); + ggml_tensor_set_f32(latent, value, l, k, j, i); + } + } + } + } + } + return latent; + } + + ggml_tensor* decode_first_stage(ggml_context* res_ctx, ggml_tensor* z) { + int64_t W = z->ne[0]; + int64_t H = z->ne[1]; + struct ggml_tensor* result_img = NULL; + struct ggml_cplan cplan; + + { + float* vec = (float*)z->data; + for (int i = 0; i < ggml_nelements(z); i++) { + vec[i] = 1.0f / scale_factor * vec[i]; + } + } + + // calculate the amount of memory required + size_t ctx_size = 10 * 1024 * 1024; // 10MB + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = true; + params.dynamic = dynamic; + + struct ggml_context* ctx = 
+        return latent;
+    }
+
+    ggml_tensor* decode_first_stage(ggml_context* res_ctx, ggml_tensor* z) {
+        struct ggml_tensor* result_img = NULL;
+        struct ggml_cplan cplan;
+
+        // z = 1.0f / scale_factor * z
+        {
+            float* vec = (float*)z->data;
+            for (int i = 0; i < ggml_nelements(z); i++) {
+                vec[i] = 1.0f / scale_factor * vec[i];
+            }
+        }
+
+        // calculate the amount of memory required
+        size_t ctx_size = 10 * 1024 * 1024;  // 10MB
+        {
+            struct ggml_init_params params;
+            params.mem_size = ctx_size;
+            params.mem_buffer = NULL;
+            params.no_alloc = true;
+            params.dynamic = dynamic;
+
+            struct ggml_context* ctx = ggml_init(params);
+            if (!ctx) {
+                LOG_ERROR("ggml_init() failed");
+                return NULL;
+            }
+
+            struct ggml_tensor* img = first_stage_model.decode(ctx, z);
+            ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
+
+            struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
+            cplan = ggml_graph_plan(vae_graph, n_threads);
+
+            ctx_size += cplan.work_size;
+            LOG_DEBUG("vae context needs %.2fMB static memory, with work_size needing %.2fMB",
+                      ctx_size * 1.0f / 1024 / 1024,
+                      cplan.work_size * 1.0f / 1024 / 1024);
+
+            ggml_free(ctx);
+        }
+
+        {
+            struct ggml_init_params params;
+            params.mem_size = ctx_size;
+            params.mem_buffer = NULL;
+            params.no_alloc = false;
+            params.dynamic = dynamic;
+
+            struct ggml_context* ctx = ggml_init(params);
+            if (!ctx) {
+                LOG_ERROR("ggml_init() failed");
+                return NULL;
+            }
+
+            struct ggml_tensor* img = first_stage_model.decode(ctx, z);
+            struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
+
+            int64_t t0 = ggml_time_ms();
+            ggml_graph_compute_with_ctx(ctx, vae_graph, n_threads);
+            int64_t t1 = ggml_time_ms();
+
+#ifdef GGML_PERF
+            ggml_graph_print(vae_graph);
+#endif
+            LOG_DEBUG("computing vae graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+
+            result_img = ggml_dup_tensor(res_ctx, img);
+            copy_ggml_tensor(result_img, img);
+
+            size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+            if (rt_mem_size > max_rt_mem_size) {
+                max_rt_mem_size = rt_mem_size;
+            }
+            size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
+
+            size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+            if (curr_mem_size > max_mem_size) {
+                max_mem_size = curr_mem_size;
+            }
+
+            LOG_INFO(
+                "vae graph uses %.2fMB of memory: params %.2fMB, "
+                "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+                graph_mem_size * 1.0f / 1024 / 1024,
+                ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
+                rt_mem_size * 1.0f / 1024 / 1024,
+                ctx_size * 1.0f / 1024 / 1024,
+                ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+            LOG_DEBUG("%zu bytes of dynamic memory have not been released yet", ggml_dynamic_size());
+
+            ggml_free(ctx);
+        }
+
+        return result_img;
+    }
+};
+
+/*================================================= StableDiffusion ==================================================*/
+
+StableDiffusion::StableDiffusion(int n_threads,
+                                 bool vae_decode_only,
+                                 bool free_params_immediately,
+                                 RNGType rng_type) {
+    sd = std::make_shared<StableDiffusionGGML>(n_threads,
+                                               vae_decode_only,
+                                               free_params_immediately,
+                                               rng_type);
+}
+
+bool StableDiffusion::load_from_file(const std::string& file_path, Schedule s) {
+    return sd->load_from_file(file_path, s);
+}
+
+std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
+                                              const std::string& negative_prompt,
+                                              float cfg_scale,
+                                              int width,
+                                              int height,
+                                              SampleMethod sample_method,
+                                              int sample_steps,
+                                              int64_t seed) {
+    std::vector<uint8_t> result;
+    struct ggml_init_params params;
+    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10MB
+    params.mem_size += width * height * 3 * sizeof(float) * 2;
+    params.mem_buffer = NULL;
+    params.no_alloc = false;
+    params.dynamic = false;
+    struct ggml_context* ctx = ggml_init(params);
+    if (!ctx) {
+        LOG_ERROR("ggml_init() failed");
+        return result;
+    }
+
+    if (seed < 0) {
+        seed = (int)time(NULL);
+    }
+    sd->rng->manual_seed(seed);
+
+    int64_t t0 = ggml_time_ms();
+    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    struct ggml_tensor* uc = NULL;
+    if (cfg_scale != 1.0) {
+        uc = sd->get_learned_condition(ctx, negative_prompt);
+    }
+    int64_t t1 = ggml_time_ms();
+    LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
+        ggml_free(sd->clip_params_ctx);
+        sd->clip_params_ctx = NULL;
+    }
+
+    int C = 4;
+    int W = width / 8;
+    int H = height / 8;
+    struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);
+    ggml_tensor_set_f32_randn(x_t, sd->rng);
+
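+    // The VAE downsamples by a factor of 8 per spatial dim and the SD latent
+    // has 4 channels, so e.g. a 512x512 request is sampled as a 64x64x4
+    // Gaussian latent here and only decoded to pixels afterwards.
+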
LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + + if (sd->free_params_immediately) { + sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx); + ggml_free(sd->clip_params_ctx); + sd->clip_params_ctx = NULL; + } + + int C = 4; + int W = width / 8; + int H = height / 8; + struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1); + ggml_tensor_set_f32_randn(x_t, sd->rng); + + std::vector sigmas = sd->denoiser->schedule->get_sigmas(sample_steps); + + LOG_INFO("start sampling"); + struct ggml_tensor* x_0 = sd->sample(ctx, x_t, c, uc, cfg_scale, sample_method, sigmas); + // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); + // print_ggml_tensor(x_0); + int64_t t2 = ggml_time_ms(); + LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); + + if (sd->free_params_immediately) { + sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx); + ggml_free(sd->unet_params_ctx); + sd->unet_params_ctx = NULL; + } + + struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0); + if (img != NULL) { + result = ggml_to_image_vec(img); + } + int64_t t3 = ggml_time_ms(); + LOG_INFO("decode_first_stage completed, taking %.2fs", (t3 - t2) * 1.0f / 1000); + + if (sd->free_params_immediately) { + sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx); + ggml_free(sd->vae_params_ctx); + sd->vae_params_ctx = NULL; + } + + LOG_INFO( + "txt2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, " + "peak runtime memory %.2fMB", + (t3 - t0) * 1.0f / 1000, + sd->max_mem_size * 1.0f / 1024 / 1024, + sd->max_params_mem_size * 1.0f / 1024 / 1024, + sd->max_rt_mem_size * 1.0f / 1024 / 1024); + + ggml_free(ctx); + return result; +} + +std::vector StableDiffusion::img2img(const std::vector& init_img_vec, + const std::string& prompt, + const std::string& negative_prompt, + float cfg_scale, + int width, + int height, + SampleMethod sample_method, + int sample_steps, + float strength, + int64_t seed) { + std::vector result; + if (init_img_vec.size() != width * height * 3) { + return result; + } + LOG_INFO("img2img %dx%d", width, height); + + std::vector sigmas = sd->denoiser->schedule->get_sigmas(sample_steps); + size_t t_enc = static_cast(sample_steps * strength); + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); + + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_size += width * height * 3 * sizeof(float) * 2; + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return result; + } + + if (seed < 0) { + seed = (int)time(NULL); + } + sd->rng->manual_seed(seed); + + ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1); + image_vec_to_ggml(init_img_vec, init_img); + + int64_t t0 = ggml_time_ms(); + ggml_tensor* moments = sd->encode_first_stage(ctx, init_img); + ggml_tensor* init_latent = sd->get_first_stage_encoding(ctx, moments); + // print_ggml_tensor(init_latent); + int64_t t1 = ggml_time_ms(); + LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + + ggml_reset_curr_max_dynamic_size(); // reset counter + + ggml_tensor* c = sd->get_learned_condition(ctx, prompt); + struct ggml_tensor* uc = NULL; + if (cfg_scale != 1.0) { + uc = 
+    struct ggml_init_params params;
+    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10MB
+    params.mem_size += width * height * 3 * sizeof(float) * 2;
+    params.mem_buffer = NULL;
+    params.no_alloc = false;
+    params.dynamic = false;
+    struct ggml_context* ctx = ggml_init(params);
+    if (!ctx) {
+        LOG_ERROR("ggml_init() failed");
+        return result;
+    }
+
+    if (seed < 0) {
+        seed = (int)time(NULL);
+    }
+    sd->rng->manual_seed(seed);
+
+    ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
+    image_vec_to_ggml(init_img_vec, init_img);
+
+    int64_t t0 = ggml_time_ms();
+    ggml_tensor* moments = sd->encode_first_stage(ctx, init_img);
+    ggml_tensor* init_latent = sd->get_first_stage_encoding(ctx, moments);
+    // print_ggml_tensor(init_latent);
+    int64_t t1 = ggml_time_ms();
+    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+
+    ggml_reset_curr_max_dynamic_size();  // reset counter
+
+    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    struct ggml_tensor* uc = NULL;
+    if (cfg_scale != 1.0) {
+        uc = sd->get_learned_condition(ctx, negative_prompt);
+    }
+    int64_t t2 = ggml_time_ms();
+    LOG_INFO("get_learned_condition completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
+        ggml_free(sd->clip_params_ctx);
+        sd->clip_params_ctx = NULL;
+    }
+
+    LOG_INFO("start sampling");
+    struct ggml_tensor* x_0 = sd->sample(ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
+    // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
+    // print_ggml_tensor(x_0);
+    int64_t t3 = ggml_time_ms();
+    LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
+        ggml_free(sd->unet_params_ctx);
+        sd->unet_params_ctx = NULL;
+    }
+
+    struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
+    if (img != NULL) {
+        result = ggml_to_image_vec(img);
+    }
+    int64_t t4 = ggml_time_ms();
+    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
+
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
+        ggml_free(sd->vae_params_ctx);
+        sd->vae_params_ctx = NULL;
+    }
+
+    LOG_INFO(
+        "img2img completed in %.2fs, using %.2fMB of memory: peak params memory %.2fMB, "
+        "peak runtime memory %.2fMB",
+        (t4 - t0) * 1.0f / 1000,
+        sd->max_mem_size * 1.0f / 1024 / 1024,
+        sd->max_params_mem_size * 1.0f / 1024 / 1024,
+        sd->max_rt_mem_size * 1.0f / 1024 / 1024);
+
+    ggml_free(ctx);
+
+    return result;
+}
diff --git a/stable-diffusion.cpp/stable-diffusion.h b/stable-diffusion.cpp/stable-diffusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..728793c0e962945c8834c88a621e6b7d979ae5e3
--- /dev/null
+++ b/stable-diffusion.cpp/stable-diffusion.h
@@ -0,0 +1,74 @@
+#ifndef __STABLE_DIFFUSION_H__
+#define __STABLE_DIFFUSION_H__
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+enum SDLogLevel {
+    DEBUG,
+    INFO,
+    WARN,
+    ERROR
+};
+
+enum RNGType {
+    STD_DEFAULT_RNG,
+    CUDA_RNG
+};
+
+enum SampleMethod {
+    EULER_A,
+    EULER,
+    HEUN,
+    DPM2,
+    DPMPP2S_A,
+    DPMPP2M,
+    DPMPP2Mv2,
+    N_SAMPLE_METHODS
+};
+
+enum Schedule {
+    DEFAULT,
+    DISCRETE,
+    KARRAS,
+    N_SCHEDULES
+};
+
+class StableDiffusionGGML;
+
+class StableDiffusion {
+  private:
+    std::shared_ptr<StableDiffusionGGML> sd;
+
+  public:
+    StableDiffusion(int n_threads = -1,
+                    bool vae_decode_only = false,
+                    bool free_params_immediately = false,
+                    RNGType rng_type = STD_DEFAULT_RNG);
+    bool load_from_file(const std::string& file_path, Schedule s = DEFAULT);
+    std::vector<uint8_t> txt2img(
+        const std::string& prompt,
+        const std::string& negative_prompt,
+        float cfg_scale,
+        int width,
+        int height,
+        SampleMethod sample_method,
+        int sample_steps,
+        int64_t seed);
+    std::vector<uint8_t> img2img(
+        const std::vector<uint8_t>& init_img,
+        const std::string& prompt,
+        const std::string& negative_prompt,
+        float cfg_scale,
+        int width,
+        int height,
+        SampleMethod sample_method,
+        int sample_steps,
+        float strength,
+        int64_t seed);
+};
+
+void set_sd_log_level(SDLogLevel level);
+std::string sd_get_system_info();
+
+#endif  // __STABLE_DIFFUSION_H__
\ No newline at end of file