Commit f57d7c6: Upload folder using huggingface_hub
Committed by Illumotion • 1 parent: 411033d
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .devops/cloud-v-pipeline +22 -0
- .devops/full-cuda.Dockerfile +1 -1
- .devops/full-rocm.Dockerfile +44 -0
- .devops/llama-cpp-clblast.srpm.spec +84 -0
- .devops/llama-cpp-cublas.srpm.spec +83 -0
- .devops/llama-cpp.srpm.spec +85 -0
- .devops/main-cuda.Dockerfile +1 -1
- .devops/main-rocm.Dockerfile +44 -0
- .editorconfig +3 -0
- .github/workflows/code-coverage.yml +36 -0
- .github/workflows/gguf-publish.yml +43 -0
- .gitignore +27 -30
- CMakeLists.txt +116 -20
- Dockerfile +2 -1
- MIT_LICENSE_GGML_LLAMACPP_ONLY +1 -1
- Makefile +89 -38
- Package.swift +35 -4
- README.md +1 -1
- build-info.h +2 -0
- ci/run.sh +141 -44
- class.py +313 -0
- codecov.yml +14 -0
- colab.ipynb +61 -0
- common/CMakeLists.txt +20 -0
- common/common.cpp +1270 -0
- common/common.h +211 -0
- common/console.cpp +501 -0
- common/console.h +19 -0
- common/grammar-parser.cpp +424 -0
- common/grammar-parser.h +29 -0
- common/log.h +643 -0
- convert-baichuan-hf-to-gguf.py +304 -0
- convert-falcon-hf-to-gguf.py +281 -0
- convert-gptneox-hf-to-gguf.py +251 -0
- convert-llama-ggml-to-gguf.py +451 -0
- convert-lora-to-ggml.py +22 -19
- convert-starcoder-hf-to-gguf.py +248 -0
- convert.py +638 -756
- docs/token_generation_performance_tips.md +3 -3
- examples/CMakeLists.txt +4 -21
- examples/baby-llama/baby-llama.cpp +77 -76
- examples/beam-search/CMakeLists.txt +5 -0
- examples/beam-search/beam-search.cpp +186 -0
- examples/benchmark/CMakeLists.txt +2 -1
- examples/benchmark/benchmark-matmult.cpp +23 -20
- examples/chat.sh +1 -1
- examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
- examples/convert-llama2c-to-ggml/README.md +26 -0
- examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +963 -0
- examples/embd-input/embd-input-lib.cpp +8 -11
.devops/cloud-v-pipeline ADDED
@@ -0,0 +1,22 @@
+node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
+    stage('Cleanup'){
+        cleanWs() // Cleaning previous CI build in workspace
+    }
+    stage('checkout repo'){
+        retry(5){ // Retry if the cloning fails due to some reason
+            checkout scm // Clone the repo on Runner
+        }
+    }
+    stage('Compiling llama.cpp'){
+        sh'''#!/bin/bash
+        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+        '''
+    }
+    stage('Running llama.cpp'){
+        sh'''#!/bin/bash
+        module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
+        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+        cat llama_log.txt # Printing results
+        '''
+    }
+}
.devops/full-cuda.Dockerfile CHANGED
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git
 
 COPY requirements.txt requirements.txt
 
.devops/full-rocm.Dockerfile ADDED
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
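As a usage sketch (the image tag is illustrative and not part of the commit), the full ROCm image above can be built from the repository root with:

    # Build the fat ROCm dev image; the entrypoint is the repo's tools.sh helper.
    docker build -f .devops/full-rocm.Dockerfile -t local/llama.cpp:full-rocm .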
.devops/llama-cpp-clblast.srpm.spec ADDED
@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-clblast
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        OpenCL Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires:       clblast
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamaclblast
+cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamaclblast
+%{_bindir}/llamaclblastserver
+%{_bindir}/llamaclblastsimple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
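A hedged build sketch for this spec, assuming the default rpmbuild tree and the Source0 tarball fetched by hand:

    # Place the master tarball where rpmbuild expects sources, then build the binary RPM.
    mkdir -p ~/rpmbuild/SOURCES
    curl -L -o ~/rpmbuild/SOURCES/master.tar.gz \
        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
    rpmbuild -bb .devops/llama-cpp-clblast.srpm.spec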
.devops/llama-cpp-cublas.srpm.spec ADDED
@@ -0,0 +1,83 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-cublas
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
+Requires:       cuda-toolkit
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUBLAS=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
.devops/llama-cpp.srpm.spec ADDED
@@ -0,0 +1,85 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+#    In the meantime, YYYYMMDD format will be used.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
+Requires:       libstdc++
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+Models are not included in this package and must be downloaded separately.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
+/usr/lib/systemd/system/llama.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
.devops/main-cuda.Dockerfile CHANGED
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git
 
 WORKDIR /app
 
.devops/main-rocm.Dockerfile ADDED
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]
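A hedged run sketch for the resulting image (the tag, model path, and prompt below are illustrative); ROCm containers generally need the kfd and dri devices passed through from the host:

    # Build the "main" ROCm image and run inference on a bind-mounted GGUF model.
    docker build -f .devops/main-rocm.Dockerfile -t local/llama.cpp:main-rocm .
    docker run --rm --device=/dev/kfd --device=/dev/dri \
        -v "$PWD/models:/models" local/llama.cpp:main-rocm \
        -m /models/model.gguf -p "Hello" -n 64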
.editorconfig CHANGED
@@ -17,3 +17,6 @@ indent_style = tab
 
 [prompts/*.txt]
 insert_final_newline = unset
+
+[examples/server/public/*]
+indent_size = 2
.github/workflows/code-coverage.yml ADDED
@@ -0,0 +1,36 @@
+name: Code Coverage
+on: [push, pull_request]
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 lcov
+
+      - name: Build
+        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+
+      - name: Run tests
+        run: CC=gcc-8 make test
+
+      - name: Generate coverage report
+        run: |
+          make coverage
+          make lcov-report
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        with:
+          files: lcov-report/coverage.info
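The same coverage flow can be reproduced locally; a minimal sketch assuming gcc-8 and lcov are installed:

    # Mirror the workflow steps on a developer machine.
    CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
    CC=gcc-8 make test
    make coverage
    make lcov-report   # produces lcov-report/coverage.info, the file the workflow uploads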
.github/workflows/gguf-publish.yml ADDED
@@ -0,0 +1,43 @@
+# This workflow will upload a Python Package using Twine when a GGUF release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# See `gguf-py/README.md` for how to make a release.
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  workflow_dispatch:
+  push:
+    # Pattern matched against refs/tags
+    tags:
+      - 'gguf-v*' # Push events to every version tag
+
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.x'
+      - name: Install dependencies
+        run: |
+          cd gguf-py
+          python -m pip install poetry
+          poetry install
+
+      - name: Build package
+        run: poetry build
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
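A hedged sketch of cutting a gguf-py release that triggers this workflow (the version number is illustrative; gguf-py/README.md is the authoritative guide):

    # Build locally as a sanity check, then push a tag matching 'gguf-v*'
    # to start the PyPI upload job.
    cd gguf-py
    python -m pip install poetry
    poetry build
    git tag gguf-v0.0.0            # illustrative version
    git push origin gguf-v0.0.0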
.gitignore CHANGED
@@ -1,6 +1,6 @@
 *.o
 *.a
-*.
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -12,20 +12,7 @@
 .vs/
 .vscode/
 
-build
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/
 
@@ -39,19 +26,24 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
-
+/llama-bench
+build-info.h
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
 
 __pycache__
+dist
 
 dist/
 *.spec
@@ -65,11 +57,11 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
-pyproject.toml
 poetry.lock
 poetry.toml
 
 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
@@ -78,16 +70,21 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0
 
-
-koboldcpp_failsafe.so
-koboldcpp_openblas.so
-koboldcpp_noavx2.so
-koboldcpp_clblast.so
-
-
-
-
-
-
-
-
+/koboldcpp_default.so
+/koboldcpp_failsafe.so
+/koboldcpp_openblas.so
+/koboldcpp_noavx2.so
+/koboldcpp_clblast.so
+/koboldcpp_cublas.so
+/koboldcpp_default.dll
+/koboldcpp_failsafe.dll
+/koboldcpp_openblas.dll
+/koboldcpp_noavx2.dll
+/koboldcpp_clblast.dll
+/koboldcpp_cublas.dll
+/cublas64_11.dll
+/cublasLt64_11.dll
+/rocblas/
+rocblas.dll
+hipblas.dll
+koboldcpp_hipblas.so
CMakeLists.txt CHANGED
@@ -50,6 +50,9 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+    "llama: max. batch size for using peer access")
+option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 
 
@@ -65,6 +68,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
 add_compile_definitions(GGML_USE_K_QUANTS)
+add_compile_definitions(LOG_DISABLE_LOGS)
+
+set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
+set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -75,10 +83,6 @@ if (LLAMA_CUBLAS)
 
     enable_language(CUDA)
 
-    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
-    set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
-    set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
-
     add_compile_definitions(GGML_USE_CUBLAS)
     #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
     add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
@@ -91,6 +95,7 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_F16)
     endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
     if (LLAMA_STATIC)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -121,6 +126,75 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
+if (LLAMA_HIPBLAS)
+    if (MSVC)
+        list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD/ROCm/5.5")
+    else()
+        list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    endif()
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PUBLIC CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
+
+        add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-v2-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-v2-rocm PUBLIC CC_TURING=1000000000)
+        set_source_files_properties(otherarch/ggml_v2-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-v2-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
+
+        add_library(ggml-v2-legacy-rocm OBJECT ${GGML_V2_LEGACY_CUDA_SOURCES})
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-v2-legacy-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-v2-legacy-rocm PUBLIC CC_TURING=1000000000)
+        set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
+
+        if (LLAMA_STATIC)
+            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm ggml-v2-rocm ggml-v2-legacy-rocm)
+    else()
+        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -133,15 +207,22 @@ if (LLAMA_ALL_WARNINGS)
             -Wstrict-prototypes
             -Wpointer-arith
             -Wmissing-prototypes
+            -Werror=implicit-int
+            -Wno-unused-function
         )
         set(cxx_flags
             -Wall
             -Wextra
             -Wpedantic
            -Wcast-qual
+            -Wmissing-declarations
            -Wno-unused-function
            -Wno-multichar
        )
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+            # g++ only
+            set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
+        endif()
     else()
         # todo : msvc
     endif()
@@ -153,7 +234,7 @@ if (LLAMA_ALL_WARNINGS)
 
 endif()
 
-if (
+if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 
     if (BUILD_SHARED_LIBS)
@@ -190,7 +271,7 @@ if (NOT MSVC)
     endif()
 endif()
 
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
     message(STATUS "ARM detected")
     if (MSVC)
         # TODO: arm msvc?
@@ -301,37 +382,52 @@ target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 add_library(common2
-
-
-
+            common/common.cpp
+            common/common.h
+            common/grammar-parser.h
+            common/grammar-parser.cpp)
+target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
 target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
+target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+if (LLAMA_CUBLAS)
+    set(TARGET koboldcpp_cublas)
+    add_library(${TARGET} SHARED expose.cpp expose.h)
+    target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
+    target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
+    set_target_properties(${TARGET} PROPERTIES PREFIX "")
+    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
+    target_compile_features(${TARGET} PRIVATE cxx_std_11)
+endif()
 
-
-
-
-
-
-    set_target_properties(${TARGET} PROPERTIES
-    set_target_properties(${TARGET} PROPERTIES
-
-
+if (LLAMA_HIPBLAS)
+    set(TARGET koboldcpp_hipblas)
+    add_library(${TARGET} SHARED expose.cpp expose.h)
+    target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
+    target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
+    set_target_properties(${TARGET} PROPERTIES PREFIX "")
+    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
+    target_compile_features(${TARGET} PRIVATE cxx_std_11)
+endif()
 
 
 if (MAKE_MISC_FILES)
+add_subdirectory(common)
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
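A hedged configure-and-build sketch for the new LLAMA_HIPBLAS path, following the compiler hints and CMAKE_PREFIX_PATH warning emitted above (paths assume a default /opt/rocm install):

    # Configure with the ROCm clang toolchain and the new hipBLAS option, then build.
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
        cmake -B build -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
    cmake --build build -j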
Dockerfile CHANGED
@@ -5,6 +5,7 @@ RUN apt update \
     && apt install build-essential wget libopenblas-dev make -y \
     && make LLAMA_OPENBLAS=1 \
     && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \
-    && apt remove build-essential wget make -y
+    && apt remove build-essential wget make -y \
+    && rm -fr *.bat convert-* ci docs examples otherarchs tests
 
 ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-13b-ggml-q4_0.bin", "--port", "7860"]
MIT_LICENSE_GGML_LLAMACPP_ONLY CHANGED
@@ -23,4 +23,4 @@ SOFTWARE.
 ===================================
 
 Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
-Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp
+Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
Makefile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
default:
|
2 |
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
|
3 |
dev: koboldcpp_openblas
|
4 |
dev2: koboldcpp_clblast
|
@@ -20,8 +20,6 @@ ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/nul
|
|
20 |
ARCH_ADD = -lcblas
|
21 |
endif
|
22 |
|
23 |
-
CCV := $(shell $(CC) --version | head -n 1)
|
24 |
-
CXXV := $(shell $(CXX) --version | head -n 1)
|
25 |
|
26 |
# Mac OS + Arm can report x86_64
|
27 |
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
@@ -41,8 +39,8 @@ endif
|
|
41 |
#
|
42 |
|
43 |
# keep standard at C11 and C++11
|
44 |
-
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
|
45 |
-
CXXFLAGS = -I. -I./
|
46 |
LDFLAGS =
|
47 |
|
48 |
# these are used on windows, to build some libraries with extra old device compatibility
|
@@ -110,7 +108,8 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
|
110 |
# old library NEEDS mf16c to work. so we must build with it. new one doesnt
|
111 |
ifeq ($(OS),Windows_NT)
|
112 |
CFLAGS +=
|
113 |
-
NONECFLAGS +=
|
|
|
114 |
SIMPLECFLAGS += -mavx -msse3
|
115 |
FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
|
116 |
else
|
@@ -195,6 +194,42 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l
|
|
195 |
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
196 |
endif # LLAMA_CUBLAS
|
197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
ifdef LLAMA_METAL
|
199 |
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
200 |
CXXFLAGS += -DGGML_USE_METAL
|
@@ -224,12 +259,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
|
224 |
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
225 |
endif
|
226 |
|
|
|
|
|
|
|
227 |
DEFAULT_BUILD =
|
228 |
FAILSAFE_BUILD =
|
229 |
OPENBLAS_BUILD =
|
230 |
NOAVX2_BUILD =
|
231 |
CLBLAST_BUILD =
|
232 |
CUBLAS_BUILD =
|
|
|
233 |
|
234 |
ifeq ($(OS),Windows_NT)
|
235 |
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
|
@@ -238,10 +277,12 @@ ifeq ($(OS),Windows_NT)
|
|
238 |
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
|
239 |
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
|
240 |
|
241 |
-
ifdef LLAMA_CUBLAS
|
242 |
-
|
243 |
-
endif
|
244 |
-
|
|
|
|
|
245 |
else
|
246 |
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
|
247 |
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
|
@@ -250,24 +291,29 @@ else
|
|
250 |
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
|
251 |
endif
|
252 |
ifdef LLAMA_CLBLAST
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
endif
|
259 |
|
260 |
-
ifdef LLAMA_CUBLAS
|
261 |
-
|
262 |
-
endif
|
|
|
|
|
|
|
263 |
|
264 |
ifndef LLAMA_OPENBLAS
|
265 |
ifndef LLAMA_CLBLAST
|
266 |
ifndef LLAMA_CUBLAS
|
|
|
267 |
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
|
268 |
endif
|
269 |
endif
|
270 |
endif
|
|
|
271 |
endif
|
272 |
|
273 |
|
@@ -293,16 +339,16 @@ $(info )
|
|
293 |
|
294 |
ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
|
295 |
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
296 |
-
ggml_openblas.o: ggml.c ggml.h
|
297 |
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
298 |
-
ggml_failsafe.o: ggml.c ggml.h
|
299 |
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
300 |
-
ggml_noavx2.o: ggml.c ggml.h
|
301 |
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
302 |
-
ggml_clblast.o: ggml.c ggml.h
|
303 |
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
304 |
-
ggml_cublas.o: ggml.c ggml.h
|
305 |
-
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
|
306 |
|
307 |
#quants K
|
308 |
k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
|
@@ -328,7 +374,7 @@ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
|
328 |
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
329 |
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
330 |
ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
331 |
-
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
|
332 |
|
333 |
#extreme old version compat
|
334 |
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
@@ -345,19 +391,19 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
|
|
345 |
$(CC) $(CFLAGS) -c $< -o $@
|
346 |
|
347 |
# intermediate objects
|
348 |
-
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
349 |
$(CXX) $(CXXFLAGS) -c $< -o $@
|
350 |
-
common.o:
|
351 |
$(CXX) $(CXXFLAGS) -c $< -o $@
|
352 |
-
console.o:
|
353 |
$(CXX) $(CXXFLAGS) -c $< -o $@
|
354 |
-
grammar-parser.o:
|
355 |
$(CXX) $(CXXFLAGS) -c $< -o $@
|
356 |
expose.o: expose.cpp expose.h
|
357 |
$(CXX) $(CXXFLAGS) -c $< -o $@
|
358 |
|
359 |
# idiotic "for easier compilation"
|
360 |
-
GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
361 |
gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
|
362 |
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
|
363 |
gpttype_adapter.o: $(GPTTYPE_ADAPTER)
|
@@ -365,10 +411,10 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER)
|
|
365 |
gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
|
366 |
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
367 |
gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
|
368 |
-
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
|
369 |
|
370 |
clean:
|
371 |
-
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe
|
372 |
|
373 |
main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
374 |
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
@@ -376,19 +422,24 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o
|
|
376 |
@echo '==== Run ./main -h for help. ===='
|
377 |
@echo
|
378 |
|
|
|
|
|
|
|
379 |
#generated libraries
|
380 |
-
|
381 |
$(DEFAULT_BUILD)
|
382 |
-
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
|
383 |
$(OPENBLAS_BUILD)
|
384 |
-
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o $(OBJS)
|
385 |
$(FAILSAFE_BUILD)
|
386 |
-
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o $(OBJS)
|
387 |
$(NOAVX2_BUILD)
|
388 |
-
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS)
|
389 |
$(CLBLAST_BUILD)
|
390 |
-
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS)
|
391 |
$(CUBLAS_BUILD)
|
|
|
|
|
392 |
|
393 |
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
|
394 |
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
|
|
1 |
+
default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas koboldcpp_hipblas
|
2 |
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
|
3 |
dev: koboldcpp_openblas
|
4 |
dev2: koboldcpp_clblast
|
|
|
20 |
ARCH_ADD = -lcblas
|
21 |
endif
|
22 |
|
|
|
|
|
23 |
|
24 |
# Mac OS + Arm can report x86_64
|
25 |
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
|
|
39 |
#
|
40 |
|
41 |
# keep standard at C11 and C++11
|
42 |
+
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
43 |
+
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
44 |
LDFLAGS =
|
45 |
|
46 |
# these are used on windows, to build some libraries with extra old device compatibility
|
|
|
108 |
# old library NEEDS mf16c to work. so we must build with it. new one doesnt
|
109 |
ifeq ($(OS),Windows_NT)
|
110 |
CFLAGS +=
|
111 |
+
NONECFLAGS +=
|
112 |
+
# -mno-sse3
|
113 |
SIMPLECFLAGS += -mavx -msse3
|
114 |
FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
|
115 |
else
|
|
|
194 |
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
195 |
endif # LLAMA_CUBLAS
|
196 |
|
197 |
+
ifdef LLAMA_HIPBLAS
|
198 |
+
ROCM_PATH ?= /opt/rocm
|
199 |
+
CC := $(ROCM_PATH)/llvm/bin/clang
|
200 |
+
CXX := $(ROCM_PATH)/llvm/bin/clang++
|
201 |
+
GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
202 |
+
LLAMA_CUDA_DMMV_X ?= 32
|
203 |
+
LLAMA_CUDA_MMV_Y ?= 2
|
204 |
+
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
205 |
+
HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
|
206 |
+
ifdef LLAMA_CUDA_FORCE_DMMV
|
207 |
+
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
208 |
+
endif # LLAMA_CUDA_FORCE_DMMV
|
209 |
+
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
|
210 |
+
HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
|
211 |
+
ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
|
212 |
+
-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
|
213 |
+
-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
|
214 |
+
-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
215 |
+
ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
|
216 |
+
-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
|
217 |
+
-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
|
218 |
+
-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
219 |
+
ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
|
220 |
+
-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
|
221 |
+
-DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
|
222 |
+
-DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
223 |
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
224 |
+
$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
225 |
+
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
226 |
+
$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
227 |
+
ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
|
228 |
+
$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
229 |
+
endif # LLAMA_HIPBLAS
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
ifdef LLAMA_METAL
|
234 |
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
235 |
CXXFLAGS += -DGGML_USE_METAL
|
|
|
259 |
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
260 |
endif
|
261 |
|
262 |
+
CCV := $(shell $(CC) --version | head -n 1)
|
263 |
+
CXXV := $(shell $(CXX) --version | head -n 1)
|
264 |
+
|
265 |
DEFAULT_BUILD =
|
266 |
FAILSAFE_BUILD =
|
267 |
OPENBLAS_BUILD =
|
268 |
NOAVX2_BUILD =
|
269 |
CLBLAST_BUILD =
|
270 |
CUBLAS_BUILD =
|
271 |
+
HIPBLAS_BUILD =
|
272 |
|
273 |
ifeq ($(OS),Windows_NT)
|
274 |
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
|
|
|
277 |
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
|
278 |
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
|
279 |
|
280 |
+
ifdef LLAMA_CUBLAS
|
281 |
+
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
|
282 |
+
endif
|
283 |
+
ifdef LLAMA_HIPBLAS
|
284 |
+
HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS)
|
285 |
+
endif
|
286 |
else
|
287 |
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
|
288 |
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
|
|
|
291 |
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
|
292 |
endif
|
293 |
ifdef LLAMA_CLBLAST
|
294 |
+
ifeq ($(UNAME_S),Darwin)
|
295 |
+
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
|
296 |
+
else
|
297 |
+
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
|
298 |
+
endif
|
299 |
endif
|
300 |
|
301 |
+
ifdef LLAMA_CUBLAS
|
302 |
+
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
|
303 |
+
endif
|
304 |
+
ifdef LLAMA_HIPBLAS
|
305 |
+
HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS)
|
306 |
+
endif
|
307 |
|
308 |
ifndef LLAMA_OPENBLAS
|
309 |
ifndef LLAMA_CLBLAST
|
310 |
ifndef LLAMA_CUBLAS
|
311 |
+
ifndef LLAMA_HIPBLAS
|
312 |
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
|
313 |
endif
|
314 |
endif
|
315 |
endif
|
316 |
+
endif
|
317 |
endif
|
318 |
|
319 |
|
|
|
339 |
|
340 |
ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
|
341 |
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
342 |
+
 ggml_openblas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_failsafe.o: ggml.c ggml.h ggml-cuda.h k_quants.h
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
+ggml_noavx2.o: ggml.c ggml.h ggml-cuda.h k_quants.h
 	$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
+ggml_clblast.o: ggml.c ggml.h ggml-cuda.h k_quants.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
+	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 #quants K
 k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
...
 ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 #extreme old version compat
 ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
...
 	$(CC) $(CFLAGS) -c $< -o $@
 
 # intermediate objects
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+common.o: common/common.cpp common/common.h common/log.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+console.o: common/console.cpp common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 expose.o: expose.cpp expose.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)
...
 gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
+	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 clean:
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
 main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
...
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 #generated libraries
+koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(DEFAULT_BUILD)
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(OPENBLAS_BUILD)
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(FAILSAFE_BUILD)
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(NOAVX2_BUILD)
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
 	$(CLBLAST_BUILD)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
 	$(CUBLAS_BUILD)
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+	$(HIPBLAS_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
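Each `koboldcpp_*` target above builds the same adapter code against a different BLAS/accelerator configuration and produces a separate shared library (`.so`/`.dll`). The following is a minimal, hypothetical Python sketch of how a frontend could pick one of those libraries at runtime with ctypes; the preference order and helper name are illustrative assumptions, not the project's actual loader.

# Hypothetical sketch: load one of the generated koboldcpp_* backend libraries.
import ctypes, os, platform

def load_koboldcpp_library(prefer_gpu: bool = False):
    ext = ".dll" if platform.system() == "Windows" else ".so"
    # library base names come from the Makefile targets above; order is an assumption
    candidates = (["koboldcpp_cublas", "koboldcpp_clblast"] if prefer_gpu else []) + [
        "koboldcpp_openblas", "koboldcpp_default", "koboldcpp_noavx2", "koboldcpp_failsafe",
    ]
    for name in candidates:
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), name + ext)
        if os.path.exists(path):
            return ctypes.CDLL(path)  # first backend that was actually built wins
    raise OSError("no koboldcpp backend library found")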
Package.swift
CHANGED
@@ -2,8 +2,30 @@
 
 import PackageDescription
 
+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v11),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_SWIFT"),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
     name: "llama",
+    platforms: platforms,
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
@@ -11,14 +33,23 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude:
-            sources: [
+            exclude: exclude,
+            sources: [
+                "ggml.c",
+                "llama.cpp",
+                "ggml-alloc.c",
+                "k_quants.c",
+            ] + additionalSources,
             publicHeadersPath: "spm-headers",
-            cSettings: [
+            cSettings: [
+                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .define("GGML_USE_K_QUANTS"),
+                .define("GGML_USE_ACCELERATE")
+            ] + additionalSettings,
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]
-        )
+        )
     ],
     cxxLanguageStandard: .cxx11
 )
README.md
CHANGED
@@ -3,4 +3,4 @@ sdk: docker
 emoji: 🚀
 colorFrom: yellow
 colorTo: blue
----
+---
build-info.h
CHANGED
@@ -3,5 +3,7 @@
 
 #define BUILD_NUMBER 999
 #define BUILD_COMMIT "KOBOLDCPP"
+#define BUILD_COMPILER "KCPP"
+#define BUILD_TARGET "KCPP"
 
 #endif // BUILD_INFO_H
ci/run.sh
CHANGED
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
 
     python3 ../convert.py ${path_models}
 
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
@@ -196,17 +196,17 @@
     (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     function check_ppl {
         qnt="$1"
@@ -233,6 +233,48 @@
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/3B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+
     set +e
 }
 
@@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf 'OpenLLaMA 3B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -253,6 +296,11 @@
     gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
 # open_llama_7b_v2
@@ -285,17 +333,17 @@ function gg_run_open_llama_7b_v2 {
 
     python3 ../convert.py ${path_models}
 
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test="${path_wiki}/wiki.test.raw"
 
@@ -310,17 +358,17 @@
     ./bin/quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -359,6 +407,48 @@
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/7B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # currently not supported by the CUDA backend
+    # q8_0
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
     set +e
 }
 
@@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf 'OpenLLaMA 7B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -379,6 +470,11 @@
     gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
 ## main
@@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
     python3 -m pip install -r ${SRC}/requirements.txt
+    python3 -m pip install --editable gguf-py
 fi
 
 ret=0
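The `compare_ppl` helper added above fails a run (return code 20) when the LoRA-patched model scores a higher perplexity on the Shakespeare text than the base model; note its FAIL branch prints an unset `$ppl` variable, which looks like a small upstream typo. For readers who want the same check outside of bash, here is a hedged Python sketch of the logic; the function name mirrors the script, the log-string parsing mirrors its grep/tail, and everything else is illustrative.

# Hypothetical sketch of the compare_ppl check in Python.
import re

def compare_ppl(label: str, base_log: str, lora_log: str) -> bool:
    # take the last floating-point number from each perplexity log line, like grep -oE ... | tail -n 1
    base = float(re.findall(r"[0-9]+\.[0-9]+", base_log)[-1])
    lora = float(re.findall(r"[0-9]+\.[0-9]+", lora_log)[-1])
    ok = lora <= base  # the fine-tuned adapter should fit Shakespeare at least as well as the base model
    print(f" - {label} @ {base} {lora} {'OK' if ok else 'FAIL'}")
    return ok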
class.py
ADDED
@@ -0,0 +1,313 @@
## KoboldCpp based GGML Backend by Concedo
## For use as a custom backend in KoboldAI United
## Not intended for general use.

from __future__ import annotations

import time, json
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import os
from . import koboldcpp

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)

model_backend_name = "koboldcpp" #specific instead of ggml
model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)

kcpp_backend_loaded = False

class KoboldCppException(Exception):
    """To be used for errors on cpp side of KoboldCpp."""

class KcppArgsObject:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()

    def is_valid(self, model_name, model_path, menu_path):

        foundfile = False
        try:
            files = os.listdir(model_path)
            foundfile = len([filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())])>0
        except:
            pass
        return foundfile

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):

        self.kcpp_threads = 5
        self.model_name = "GGML_Model"
        self.kcpp_ctxsize = 2048
        self.kcpp_blasbatchsize = 512
        self.kcpp_gpulayers = 0
        self.kcpp_smartcontext = False
        self.kcpp_ropescale = 0.0
        self.kcpp_ropebase = 10000.0
        self.kcpp_useclblast = None
        self.kcpp_usecublas = None
        self.kcpp_noblas = False
        self.kcpp_noavx2 = False
        self.kcpp_nommap = False
        self.kcpp_debugmode = 0
        self.kcpp_tensor_split_str = ""
        self.kcpp_tensor_split = None

        files = os.listdir(model_path)
        foundfiles = [filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())]

        requested_parameters = []
        foldermdls = []
        for ff in foundfiles:
            foldermdls.append({'text': ff, 'value': os.path.join(model_path, ff)})
        requested_parameters.append({
            "uitype": "dropdown",
            "unit": "string",
            "label": "GGML DataFile Name",
            "id": "kcpp_filename",
            "default": os.path.join(model_path, foundfiles[0]) if len(foundfiles)>0 else model_name,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Actual GGML DataFile Name",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': foldermdls
        })
        requested_parameters.append({
            "uitype": "dropdown",
            "unit": "int",
            "label": "KoboldCpp Accelerator",
            "id": "kcpp_accelerator",
            "default": 0,
            "check": {"value": "", 'check': "!="},
            'multiple': False,
            "tooltip": "KoboldCpp Accelerator",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': [{'text': 'Use No BLAS', 'value': 0}, {'text': 'Use OpenBLAS', 'value': 1}, {'text': 'Use CuBLAS', 'value': 2},
            {'text': 'Use CLBLast GPU #1', 'value': 3},{'text': 'Use CLBLast GPU #2', 'value': 4},{'text': 'Use CLBLast GPU #3', 'value': 5}
            ,{'text': 'NoAVX2 Mode (Old CPU)', 'value': 6},{'text': 'Failsafe Mode (Old CPU)', 'value': 7}],
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "Threads",
            "id": "kcpp_threads",
            "default": self.kcpp_threads,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Thread Count",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })

        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "Max Context Size",
            "id": "kcpp_ctxsize",
            "default": self.kcpp_ctxsize,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Max Context Size",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "BLAS Batch Size",
            "id": "kcpp_blasbatchsize",
            "default": self.kcpp_blasbatchsize,
            "check": {"value": "", 'check': "!="},
            "tooltip": "BLAS Batch Size",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "GPU Layers",
            "id": "kcpp_gpulayers",
            "default": self.kcpp_gpulayers,
            "check": {"value": "", 'check': "!="},
            "tooltip": "GPU Layers",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "Rope Scale",
            "id": "kcpp_ropescale",
            "default": self.kcpp_ropescale,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Rope Scale",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "int",
            "label": "Rope Base",
            "id": "kcpp_ropebase",
            "default": self.kcpp_ropebase,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Rope Base",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        requested_parameters.append({
            "uitype": "dropdown",
            "unit": "int",
            "label": "Smart Context",
            "id": "kcpp_smartcontext",
            "default": self.kcpp_smartcontext,
            "check": {"value": "", 'check': "!="},
            'multiple': False,
            "tooltip": "Smart Context",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': [{'text': 'False', 'value': False}, {'text': 'True', 'value': True}],
        })
        requested_parameters.append({
            "uitype": "dropdown",
            "unit": "int",
            "label": "Debug Mode",
            "id": "kcpp_debugmode",
            "default": self.kcpp_debugmode,
            "check": {"value": "", 'check': "!="},
            'multiple': False,
            "tooltip": "Debug Mode",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': [{'text': 'False', 'value': 0}, {'text': 'True', 'value': 1}],
        })
        requested_parameters.append({
            "uitype": "text",
            "unit": "text",
            "label": "Tensor Split",
            "id": "kcpp_tensor_split_str",
            "default": self.kcpp_tensor_split_str,
            "check": {"value": "", 'check': "!="},
            "tooltip": "Tensor Split, values are space separated",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": ""
        })
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.kcpp_threads = parameters["kcpp_threads"]
        self.kcpp_filename = parameters["kcpp_filename"]
        self.kcpp_ctxsize = parameters["kcpp_ctxsize"]
        self.kcpp_blasbatchsize = parameters["kcpp_blasbatchsize"]
        self.kcpp_gpulayers = parameters["kcpp_gpulayers"]
        self.kcpp_smartcontext = parameters["kcpp_smartcontext"]
        self.kcpp_ropescale = parameters["kcpp_ropescale"]
        self.kcpp_ropebase = parameters["kcpp_ropebase"]
        self.kcpp_debugmode = parameters["kcpp_debugmode"]
        self.kcpp_tensor_split_str = parameters["kcpp_tensor_split_str"]
        if self.kcpp_tensor_split_str and self.kcpp_tensor_split_str!="":
            splits = self.kcpp_tensor_split_str.split()
            self.kcpp_tensor_split = []
            for s in splits:
                self.kcpp_tensor_split.append(int(s))

        accel = parameters["kcpp_accelerator"]
        if accel==0:
            self.kcpp_noblas = True
        elif accel==1:
            pass
        elif accel==2:
            self.kcpp_usecublas = ["normal"]
        elif accel==3:
            self.kcpp_useclblast = [0,0]
        elif accel==4:
            self.kcpp_useclblast = [1,0]
        elif accel==5:
            self.kcpp_useclblast = [0,1]
        elif accel==6:
            self.kcpp_noavx2 = True
        elif accel==7:
            self.kcpp_noavx2 = True
            self.kcpp_noblas = True
            self.kcpp_nommap = True
        pass

    def unload(self):
        print("Attemping to unload library")
        koboldcpp.unload_libs()
        global kcpp_backend_loaded
        kcpp_backend_loaded = False
        pass

    def _load(self, save_model: bool, initial_load: bool) -> None:
        global kcpp_backend_loaded
        self.tokenizer = self._get_tokenizer("gpt2")
        if not kcpp_backend_loaded:
            kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
                port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
                psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
                blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
                unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
                usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
                useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, onready='', multiuser=False)

            koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
            kcpp_backend_loaded = True
        pass

    def _save_settings(self):
        pass

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:

        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))

        # Store context in memory to use it for comparison with generated content
        utils.koboldai_vars.lastctx = decoded_prompt

        genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
            gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
            gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
            sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)

        outputs = [genresult]
        return GenerationResult(
            model=self,
            out_batches=np.array(
                [self.tokenizer.encode(x) for x in outputs]
            ),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
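class.py registers KoboldCpp as a model backend for KoboldAI United. A hedged usage sketch follows: the method names come from the file above, but the surrounding harness, model directory, and parameter values are illustrative assumptions, and calling the underscore-prefixed `_load` directly is only done here for demonstration.

# Hypothetical driver for the backend defined above; paths and values are made up.
backend = model_backend()
model_dir = "/models/my-model-dir"  # assumed directory containing a .gguf or ggml .bin file
if backend.is_valid("GGML_Model", model_dir, menu_path=""):
    # start from the defaults advertised by the backend, then override a few fields
    params = {p["id"]: p["default"] for p in
              backend.get_requested_parameters("GGML_Model", model_dir, "")}
    params["kcpp_accelerator"] = 2                      # 2 selects CuBLAS in the dropdown above
    params["kcpp_filename"] = model_dir + "/model.gguf" # illustrative file name
    backend.set_input_parameters(params)
    backend._load(save_model=False, initial_load=True)  # initializes the C++ library once per process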
codecov.yml
ADDED
@@ -0,0 +1,14 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto
colab.ipynb
ADDED
@@ -0,0 +1,61 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyOv14c2MWENhO6RJ3uy6vD7",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/henk717/koboldcpp/blob/concedo/colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "uJS9i_Dltv8Y"
      },
      "outputs": [],
      "source": [
        "#@title <b>v-- Enter your model below and then click this to start Koboldcpp</b>\n",
        "\n",
        "Model = \"https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGML/resolve/main/airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q4_0.bin\" #@param [\"\"]{allow-input: true}\n",
        "Layers = 43 #@param [43]{allow-input: true}\n",
        "\n",
        "%cd /content\n",
        "!git clone https://github.com/LostRuins/koboldcpp\n",
        "%cd /content/koboldcpp\n",
        "!make LLAMA_CUBLAS=1\n",
        "\n",
        "!wget $Model -O model.ggml\n",
        "!wget -c https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64\n",
        "!chmod +x cloudflared-linux-amd64\n",
        "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n",
        "!sleep 10\n",
        "!cat nohup.out\n",
        "!python koboldcpp.py model.ggml --stream --usecublas 0 --gpulayers $Layers --hordeconfig concedo\n"
      ]
    }
  ]
}
common/CMakeLists.txt
ADDED
@@ -0,0 +1,20 @@
# common

set(TARGET common)

add_library(${TARGET} OBJECT
    common.h
    common.cpp
    console.h
    console.cpp
    grammar-parser.h
    grammar-parser.cpp
    )

if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
common/common.cpp
ADDED
@@ -0,0 +1,1270 @@
#include "common.h"
#include "build-info.h"
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iterator>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
#include <cinttypes>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#   define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

int32_t get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32)
    //TODO: Implement
#endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

static void process_escapes(std::string& input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n':  input[output_idx++] = '\n'; break;
                case 'r':  input[output_idx++] = '\r'; break;
                case 't':  input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                default:   input[output_idx++] = '\\';
                           input[output_idx++] = input[input_idx]; break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
    std::string arg;
    gpt_params default_params;
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "-s" || arg == "--seed") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.seed = std::stoul(argv[i]);
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_threads = std::stoi(argv[i]);
            if (params.n_threads <= 0) {
                params.n_threads = std::thread::hardware_concurrency();
            }
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.prompt = argv[i];
        } else if (arg == "-e" || arg == "--escape") {
            params.escape = true;
        } else if (arg == "--prompt-cache") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.path_prompt_cache = argv[i];
        } else if (arg == "--prompt-cache-all") {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-n" || arg == "--n-predict") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_predict = std::stoi(argv[i]);
        } else if (arg == "--top-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_base = std::stof(argv[i]);
        } else if (arg == "--rope-freq-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
        } else if (arg == "--rope-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.temp = std::stof(argv[i]);
        } else if (arg == "--tfs") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.tfs_z = std::stof(argv[i]);
        } else if (arg == "--typical") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.typical_p = std::stof(argv[i]);
        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.frequency_penalty = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat = std::stoi(argv[i]);
        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_eta = std::stof(argv[i]);
        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_tau = std::stof(argv[i]);
        } else if (arg == "--cfg-negative-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.cfg_negative_prompt = argv[i];
        } else if (arg == "--cfg-negative-prompt-file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
            if (params.cfg_negative_prompt.back() == '\n') {
                params.cfg_negative_prompt.pop_back();
            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.cfg_scale = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_batch = std::stoi(argv[i]);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_keep = std::stoi(argv[i]);
        } else if (arg == "--draft") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_draft = std::stoi(argv[i]);
        } else if (arg == "--chunks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_chunks = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model = argv[i];
        } else if (arg == "-md" || arg == "--model-draft") {
            if (++i >= argc) {
                invalid_param = true;
                break;
...
|
330 |
+
}
|
331 |
+
params.model_draft = argv[i];
|
332 |
+
} else if (arg == "-a" || arg == "--alias") {
|
333 |
+
if (++i >= argc) {
|
334 |
+
invalid_param = true;
|
335 |
+
break;
|
336 |
+
}
|
337 |
+
params.model_alias = argv[i];
|
338 |
+
} else if (arg == "--lora") {
|
339 |
+
if (++i >= argc) {
|
340 |
+
invalid_param = true;
|
341 |
+
break;
|
342 |
+
}
|
343 |
+
params.lora_adapter = argv[i];
|
344 |
+
params.use_mmap = false;
|
345 |
+
} else if (arg == "--lora-base") {
|
346 |
+
if (++i >= argc) {
|
347 |
+
invalid_param = true;
|
348 |
+
break;
|
349 |
+
}
|
350 |
+
params.lora_base = argv[i];
|
351 |
+
} else if (arg == "-i" || arg == "--interactive") {
|
352 |
+
params.interactive = true;
|
353 |
+
} else if (arg == "--embedding") {
|
354 |
+
params.embedding = true;
|
355 |
+
} else if (arg == "--interactive-first") {
|
356 |
+
params.interactive_first = true;
|
357 |
+
} else if (arg == "-ins" || arg == "--instruct") {
|
358 |
+
params.instruct = true;
|
359 |
+
} else if (arg == "--multiline-input") {
|
360 |
+
params.multiline_input = true;
|
361 |
+
} else if (arg == "--simple-io") {
|
362 |
+
params.simple_io = true;
|
363 |
+
} else if (arg == "--color") {
|
364 |
+
params.use_color = true;
|
365 |
+
} else if (arg == "--mlock") {
|
366 |
+
params.use_mlock = true;
|
367 |
+
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
|
368 |
+
if (++i >= argc) {
|
369 |
+
invalid_param = true;
|
370 |
+
break;
|
371 |
+
}
|
372 |
+
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
373 |
+
params.n_gpu_layers = std::stoi(argv[i]);
|
374 |
+
#else
|
375 |
+
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
376 |
+
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
377 |
+
#endif
|
378 |
+
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
379 |
+
if (++i >= argc) {
|
380 |
+
invalid_param = true;
|
381 |
+
break;
|
382 |
+
}
|
383 |
+
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
384 |
+
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
385 |
+
#else
|
386 |
+
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
387 |
+
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
388 |
+
#endif
|
389 |
+
} else if (arg == "--main-gpu" || arg == "-mg") {
|
390 |
+
if (++i >= argc) {
|
391 |
+
invalid_param = true;
|
392 |
+
break;
|
393 |
+
}
|
394 |
+
#ifdef GGML_USE_CUBLAS
|
395 |
+
params.main_gpu = std::stoi(argv[i]);
|
396 |
+
#else
|
397 |
+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
398 |
+
#endif
|
399 |
+
} else if (arg == "--tensor-split" || arg == "-ts") {
|
400 |
+
if (++i >= argc) {
|
401 |
+
invalid_param = true;
|
402 |
+
break;
|
403 |
+
}
|
404 |
+
#ifdef GGML_USE_CUBLAS
|
405 |
+
std::string arg_next = argv[i];
|
406 |
+
|
407 |
+
// split string by , and /
|
408 |
+
const std::regex regex{R"([,/]+)"};
|
409 |
+
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
410 |
+
std::vector<std::string> split_arg{it, {}};
|
411 |
+
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
412 |
+
|
413 |
+
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
414 |
+
if (i < split_arg.size()) {
|
415 |
+
params.tensor_split[i] = std::stof(split_arg[i]);
|
416 |
+
} else {
|
417 |
+
params.tensor_split[i] = 0.0f;
|
418 |
+
}
|
419 |
+
}
|
420 |
+
#else
|
421 |
+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
422 |
+
#endif // GGML_USE_CUBLAS
|
423 |
+
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
424 |
+
#ifdef GGML_USE_CUBLAS
|
425 |
+
params.mul_mat_q = false;
|
426 |
+
#else
|
427 |
+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
428 |
+
#endif // GGML_USE_CUBLAS
|
429 |
+
} else if (arg == "--low-vram" || arg == "-lv") {
|
430 |
+
#ifdef GGML_USE_CUBLAS
|
431 |
+
params.low_vram = true;
|
432 |
+
#else
|
433 |
+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
434 |
+
#endif // GGML_USE_CUBLAS
|
435 |
+
} else if (arg == "--no-mmap") {
|
436 |
+
params.use_mmap = false;
|
437 |
+
} else if (arg == "--numa") {
|
438 |
+
params.numa = true;
|
439 |
+
} else if (arg == "--export") {
|
440 |
+
params.export_cgraph = true;
|
441 |
+
} else if (arg == "--verbose-prompt") {
|
442 |
+
params.verbose_prompt = true;
|
443 |
+
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
444 |
+
if (++i >= argc) {
|
445 |
+
invalid_param = true;
|
446 |
+
break;
|
447 |
+
}
|
448 |
+
params.antiprompt.push_back(argv[i]);
|
449 |
+
} else if (arg == "-ld" || arg == "--logdir") {
|
450 |
+
if (++i >= argc) {
|
451 |
+
invalid_param = true;
|
452 |
+
break;
|
453 |
+
}
|
454 |
+
params.logdir = argv[i];
|
455 |
+
|
456 |
+
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
457 |
+
params.logdir += DIRECTORY_SEPARATOR;
|
458 |
+
}
|
459 |
+
} else if (arg == "--perplexity") {
|
460 |
+
params.perplexity = true;
|
461 |
+
} else if (arg == "--ppl-stride") {
|
462 |
+
if (++i >= argc) {
|
463 |
+
invalid_param = true;
|
464 |
+
break;
|
465 |
+
}
|
466 |
+
params.ppl_stride = std::stoi(argv[i]);
|
467 |
+
} else if (arg == "--ppl-output-type") {
|
468 |
+
if (++i >= argc) {
|
469 |
+
invalid_param = true;
|
470 |
+
break;
|
471 |
+
}
|
472 |
+
params.ppl_output_type = std::stoi(argv[i]);
|
473 |
+
} else if (arg == "--hellaswag") {
|
474 |
+
params.hellaswag = true;
|
475 |
+
} else if (arg == "--hellaswag-tasks") {
|
476 |
+
if (++i >= argc) {
|
477 |
+
invalid_param = true;
|
478 |
+
break;
|
479 |
+
}
|
480 |
+
params.hellaswag_tasks = std::stoi(argv[i]);
|
481 |
+
} else if (arg == "--ignore-eos") {
|
482 |
+
params.ignore_eos = true;
|
483 |
+
} else if (arg == "--no-penalize-nl") {
|
484 |
+
params.penalize_nl = false;
|
485 |
+
} else if (arg == "-l" || arg == "--logit-bias") {
|
486 |
+
if (++i >= argc) {
|
487 |
+
invalid_param = true;
|
488 |
+
break;
|
489 |
+
}
|
490 |
+
std::stringstream ss(argv[i]);
|
491 |
+
llama_token key;
|
492 |
+
char sign;
|
493 |
+
std::string value_str;
|
494 |
+
try {
|
495 |
+
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
496 |
+
params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
497 |
+
} else {
|
498 |
+
throw std::exception();
|
499 |
+
}
|
500 |
+
} catch (const std::exception&) {
|
501 |
+
invalid_param = true;
|
502 |
+
break;
|
503 |
+
}
|
504 |
+
} else if (arg == "-h" || arg == "--help") {
|
505 |
+
gpt_print_usage(argc, argv, default_params);
|
506 |
+
#ifndef LOG_DISABLE_LOGS
|
507 |
+
log_print_usage();
|
508 |
+
#endif // LOG_DISABLE_LOGS
|
509 |
+
exit(0);
|
510 |
+
} else if (arg == "--random-prompt") {
|
511 |
+
params.random_prompt = true;
|
512 |
+
} else if (arg == "--in-prefix-bos") {
|
513 |
+
params.input_prefix_bos = true;
|
514 |
+
} else if (arg == "--in-prefix") {
|
515 |
+
if (++i >= argc) {
|
516 |
+
invalid_param = true;
|
517 |
+
break;
|
518 |
+
}
|
519 |
+
params.input_prefix = argv[i];
|
520 |
+
} else if (arg == "--in-suffix") {
|
521 |
+
if (++i >= argc) {
|
522 |
+
invalid_param = true;
|
523 |
+
break;
|
524 |
+
}
|
525 |
+
params.input_suffix = argv[i];
|
526 |
+
} else if (arg == "--grammar") {
|
527 |
+
if (++i >= argc) {
|
528 |
+
invalid_param = true;
|
529 |
+
break;
|
530 |
+
}
|
531 |
+
params.grammar = argv[i];
|
532 |
+
} else if (arg == "--grammar-file") {
|
533 |
+
if (++i >= argc) {
|
534 |
+
invalid_param = true;
|
535 |
+
break;
|
536 |
+
}
|
537 |
+
std::ifstream file(argv[i]);
|
538 |
+
if (!file) {
|
539 |
+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
540 |
+
invalid_param = true;
|
541 |
+
break;
|
542 |
+
}
|
543 |
+
std::copy(
|
544 |
+
std::istreambuf_iterator<char>(file),
|
545 |
+
std::istreambuf_iterator<char>(),
|
546 |
+
std::back_inserter(params.grammar)
|
547 |
+
);
|
548 |
+
#ifndef LOG_DISABLE_LOGS
|
549 |
+
// Parse args for logging parameters
|
550 |
+
} else if ( log_param_single_parse( argv[i] ) ) {
|
551 |
+
// Do nothing, log_param_single_parse automatically does it's thing
|
552 |
+
// and returns if a match was found and parsed.
|
553 |
+
} else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
|
554 |
+
// We have a matching known parameter requiring an argument,
|
555 |
+
// now we need to check if there is anything after this argv
|
556 |
+
// and flag invalid_param or parse it.
|
557 |
+
if (++i >= argc) {
|
558 |
+
invalid_param = true;
|
559 |
+
break;
|
560 |
+
}
|
561 |
+
if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
|
562 |
+
invalid_param = true;
|
563 |
+
break;
|
564 |
+
}
|
565 |
+
// End of Parse args for logging parameters
|
566 |
+
#endif // LOG_DISABLE_LOGS
|
567 |
+
} else {
|
568 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
569 |
+
gpt_print_usage(argc, argv, default_params);
|
570 |
+
exit(1);
|
571 |
+
}
|
572 |
+
}
|
573 |
+
if (invalid_param) {
|
574 |
+
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
575 |
+
gpt_print_usage(argc, argv, default_params);
|
576 |
+
exit(1);
|
577 |
+
}
|
578 |
+
if (params.prompt_cache_all &&
|
579 |
+
(params.interactive || params.interactive_first ||
|
580 |
+
params.instruct)) {
|
581 |
+
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
|
582 |
+
gpt_print_usage(argc, argv, default_params);
|
583 |
+
exit(1);
|
584 |
+
}
|
585 |
+
|
586 |
+
if (params.escape) {
|
587 |
+
process_escapes(params.prompt);
|
588 |
+
process_escapes(params.input_prefix);
|
589 |
+
process_escapes(params.input_suffix);
|
590 |
+
}
|
591 |
+
|
592 |
+
return true;
|
593 |
+
}
|
594 |
+
|
595 |
+
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
596 |
+
printf("usage: %s [options]\n", argv[0]);
|
597 |
+
printf("\n");
|
598 |
+
printf("options:\n");
|
599 |
+
printf(" -h, --help show this help message and exit\n");
|
600 |
+
printf(" -i, --interactive run in interactive mode\n");
|
601 |
+
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
602 |
+
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
603 |
+
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
604 |
+
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
|
605 |
+
printf(" halt generation at PROMPT, return control in interactive mode\n");
|
606 |
+
printf(" (can be specified more than once for multiple prompts).\n");
|
607 |
+
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
608 |
+
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
609 |
+
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
610 |
+
printf(" -p PROMPT, --prompt PROMPT\n");
|
611 |
+
printf(" prompt to start generation with (default: empty)\n");
|
612 |
+
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
613 |
+
printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
614 |
+
printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
615 |
+
printf(" not supported with --interactive or other interactive options\n");
|
616 |
+
printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
|
617 |
+
printf(" --random-prompt start with a randomized prompt.\n");
|
618 |
+
printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
|
619 |
+
printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
620 |
+
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
621 |
+
printf(" -f FNAME, --file FNAME\n");
|
622 |
+
printf(" prompt file to start generation.\n");
|
623 |
+
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
624 |
+
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
625 |
+
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
626 |
+
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
627 |
+
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
628 |
+
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
629 |
+
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
630 |
+
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
631 |
+
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
632 |
+
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
633 |
+
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
634 |
+
printf(" --mirostat N use Mirostat sampling.\n");
|
635 |
+
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
636 |
+
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
637 |
+
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
638 |
+
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
639 |
+
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
640 |
+
printf(" modifies the likelihood of token appearing in the completion,\n");
|
641 |
+
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
642 |
+
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
643 |
+
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
|
644 |
+
printf(" --grammar-file FNAME file to read grammar from\n");
|
645 |
+
printf(" --cfg-negative-prompt PROMPT\n");
|
646 |
+
printf(" negative prompt to use for guidance. (default: empty)\n");
|
647 |
+
printf(" --cfg-negative-prompt-file FNAME\n");
|
648 |
+
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
649 |
+
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
650 |
+
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
651 |
+
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
652 |
+
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
653 |
+
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
654 |
+
printf(" --no-penalize-nl do not penalize newline token\n");
|
655 |
+
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
656 |
+
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
657 |
+
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
658 |
+
printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
|
659 |
+
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
660 |
+
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
661 |
+
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
662 |
+
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
663 |
+
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
664 |
+
if (llama_mlock_supported()) {
|
665 |
+
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
666 |
+
}
|
667 |
+
if (llama_mmap_supported()) {
|
668 |
+
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
669 |
+
}
|
670 |
+
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
671 |
+
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
672 |
+
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
673 |
+
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
674 |
+
printf(" -ngl N, --n-gpu-layers N\n");
|
675 |
+
printf(" number of layers to store in VRAM\n");
|
676 |
+
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
677 |
+
printf(" number of layers to store in VRAM for the draft model\n");
|
678 |
+
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
679 |
+
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
680 |
+
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
681 |
+
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
682 |
+
#ifdef GGML_USE_CUBLAS
|
683 |
+
printf(" -nommq, --no-mul-mat-q\n");
|
684 |
+
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
685 |
+
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
686 |
+
#endif // GGML_USE_CUBLAS
|
687 |
+
#endif
|
688 |
+
printf(" --export export the computation graph to 'llama.ggml'\n");
|
689 |
+
printf(" --verbose-prompt print prompt before generation\n");
|
690 |
+
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
691 |
+
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
692 |
+
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
693 |
+
printf(" -m FNAME, --model FNAME\n");
|
694 |
+
printf(" model path (default: %s)\n", params.model.c_str());
|
695 |
+
printf(" -md FNAME, --model-draft FNAME\n");
|
696 |
+
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
697 |
+
printf(" -ld LOGDIR, --logdir LOGDIR\n");
|
698 |
+
printf(" path under which to save YAML logs (no logging if unset)\n");
|
699 |
+
printf("\n");
|
700 |
+
}

std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
        case 2: return "When";
        case 3: return "The";
        case 4: return "After";
        case 5: return "If";
        case 6: return "import";
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
        default: return "To";
    }

    return "The";
}

//
// Model utils
//

struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx        = params.n_ctx;
    lparams.n_batch      = params.n_batch;
    if (params.n_gpu_layers != -1) {
        lparams.n_gpu_layers = params.n_gpu_layers;
    }
    lparams.main_gpu     = params.main_gpu;
    lparams.tensor_split = params.tensor_split;
    lparams.low_vram     = params.low_vram;
    lparams.mul_mat_q    = params.mul_mat_q;
    lparams.seed         = params.seed;
    lparams.f16_kv       = params.memory_f16;
    lparams.use_mmap     = params.use_mmap;
    lparams.use_mlock    = params.use_mlock;
    lparams.logits_all   = params.perplexity;
    lparams.embedding    = params.embedding;
    lparams.rope_freq_base  = params.rope_freq_base;
    lparams.rope_freq_scale = params.rope_freq_scale;

    return lparams;
}

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto lparams = llama_context_params_from_gpt_params(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return std::make_tuple(nullptr, nullptr);
    }

    llama_context * lctx = llama_new_context_with_model(model, lparams);
    if (lctx == NULL) {
        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_free_model(model);
        return std::make_tuple(nullptr, nullptr);
    }

    if (!params.lora_adapter.empty()) {
        int err = llama_model_apply_lora_from_file(model,
                                                   params.lora_adapter.c_str(),
                                                   params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                                   params.n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
            llama_free(lctx);
            llama_free_model(model);
            return std::make_tuple(nullptr, nullptr);
        }
    }

    if (params.ignore_eos) {
        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }

    {
        LOG("warming up the model with an empty run\n");

        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
        llama_reset_timings(lctx);
    }

    return std::make_tuple(model, lctx);
}
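
A minimal usage sketch (not part of the uploaded diff) showing how the two helpers above are typically combined; the only calls assumed beyond this file are llama_backend_init/llama_backend_free, llama_free and llama_free_model from llama.h:

// illustrative only -- wiring gpt_params_parse + llama_init_from_gpt_params together
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_backend_init(params.numa);

    llama_model   * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1; // model loading or context creation failed
    }

    // ... evaluate the prompt and sample tokens here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}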

//
// Vocab utils
//

std::vector<llama_token> llama_tokenize(
        struct llama_context * ctx,
        const std::string & text,
        bool add_bos) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }

    return std::string(result.data(), result.size());
}

std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos_id = llama_token_bos(ctx);

    std::string piece;
    std::string result;

    for (size_t i = 0; i < tokens.size(); ++i) {
        piece = llama_token_to_piece(ctx, tokens[i]);

        // remove the leading space of the first non-BOS token
        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
            piece = piece.substr(1);
        }

        result += piece;
    }

    return result;
}

std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::string piece;
    std::string result;

    for (size_t i = 0; i < tokens.size(); ++i) {
        piece = llama_token_to_piece(ctx, tokens[i]);

        result += piece;
    }

    return result;
}
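
A short sketch (not part of the diff) of how these vocab helpers fit together: tokenize a prompt, inspect the pieces, and rebuild the text. It assumes a valid llama_context * ctx created with llama_init_from_gpt_params above; for BPE-based vocabularies llama_detokenize_bpe would be used instead of llama_detokenize_spm.

// tokenize, print each token next to its text piece, then round-trip back to a string
const std::string prompt = "Hello world";
const std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_bos=*/true);

for (size_t i = 0; i < tokens.size(); ++i) {
    printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
}

const std::string text = llama_detokenize_spm(ctx, tokens);
printf("detokenized: '%s'\n", text.c_str());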

//
// Sampling utils
//

llama_token llama_sample_token(
        struct llama_context * ctx,
        struct llama_context * ctx_guidance,
        struct llama_grammar * grammar,
        const struct gpt_params & params,
        const std::vector<llama_token> & last_tokens,
        std::vector<llama_token_data> & candidates,
        int idx) {
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(ctx);

    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    const float   repeat_penalty  = params.repeat_penalty;
    const float   alpha_presence  = params.presence_penalty;
    const float   alpha_frequency = params.frequency_penalty;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
    const bool    penalize_nl     = params.penalize_nl;

    llama_token id = 0;

    float * logits = llama_get_logits(ctx) + idx * n_vocab;

    // Apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }

    candidates.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }

    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

    if (ctx_guidance) {
        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
    }

    // apply penalties
    if (!last_tokens.empty()) {
        const float nl_logit = logits[llama_token_nl(ctx)];
        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);

        llama_sample_repetition_penalty(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, repeat_penalty);
        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, alpha_frequency, alpha_presence);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
                    cur_p.data[idx].logit = nl_logit;
                    break;
                }
            }
        }
    }

    if (grammar != NULL) {
        llama_sample_grammar(ctx, &cur_p, grammar);
    }

    if (temp <= 0) {
        // Greedy sampling
        id = llama_sample_token_greedy(ctx, &cur_p);
    } else {
        if (mirostat == 1) {
            static float mirostat_mu = 2.0f * mirostat_tau;
            const int mirostat_m = 100;
            llama_sample_temperature(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
        } else if (mirostat == 2) {
            static float mirostat_mu = 2.0f * mirostat_tau;
            llama_sample_temperature(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
        } else {
            // Temperature sampling
            llama_sample_top_k     (ctx, &cur_p, top_k, 1);
            llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
            llama_sample_typical   (ctx, &cur_p, typical_p, 1);
            llama_sample_top_p     (ctx, &cur_p, top_p, 1);
            llama_sample_temperature(ctx, &cur_p, temp);

            {
                const int n_top = 10;
                LOG("top %d candidates:\n", n_top);

                for (int i = 0; i < n_top; i++) {
                    const llama_token id = cur_p.data[i].id;
                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
                }
            }

            id = llama_sample_token(ctx, &cur_p);

            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
        }
    }
    // printf("`%d`", candidates_p.size);

    if (grammar != NULL) {
        llama_grammar_accept_token(ctx, grammar, id);
    }

    return id;
}
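
A bare-bones generation loop around llama_sample_token() (a sketch, not part of the diff). It assumes ctx and params were set up as in the earlier sketch, passes NULL for the guidance context and grammar, and uses the same llama_eval() call that the warm-up code above relies on:

// evaluate the prompt, then repeatedly sample and feed back one token at a time
std::vector<llama_token> last_tokens = llama_tokenize(ctx, params.prompt, /*add_bos=*/true);
std::vector<llama_token_data> candidates;
candidates.reserve(llama_n_vocab(ctx));

llama_eval(ctx, last_tokens.data(), (int) last_tokens.size(), 0, params.n_threads);
int n_past = (int) last_tokens.size();

for (int i = 0; i < params.n_predict; ++i) {
    const llama_token id = llama_sample_token(ctx, NULL, NULL, params, last_tokens, candidates);

    if (id == llama_token_eos(ctx)) {
        break; // end of stream
    }
    printf("%s", llama_token_to_piece(ctx, id).c_str());

    llama_eval(ctx, &id, 1, n_past++, params.n_threads); // feed the token back in
    last_tokens.push_back(id);                           // kept for the repetition penalty
}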
|
981 |
+
|
982 |
+
//
|
983 |
+
// YAML utils
|
984 |
+
//
|
985 |
+
|
986 |
+
// returns true if successful, false otherwise
|
987 |
+
bool create_directory_with_parents(const std::string & path) {
|
988 |
+
#ifdef _WIN32
|
989 |
+
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
990 |
+
std::wstring wpath = converter.from_bytes(path);
|
991 |
+
|
992 |
+
// if the path already exists, check whether it's a directory
|
993 |
+
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
994 |
+
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
995 |
+
return true;
|
996 |
+
}
|
997 |
+
|
998 |
+
size_t pos_slash = 0;
|
999 |
+
|
1000 |
+
// process path from front to back, procedurally creating directories
|
1001 |
+
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
1002 |
+
const std::wstring subpath = wpath.substr(0, pos_slash);
|
1003 |
+
const wchar_t * test = subpath.c_str();
|
1004 |
+
|
1005 |
+
const bool success = CreateDirectoryW(test, NULL);
|
1006 |
+
if (!success) {
|
1007 |
+
const DWORD error = GetLastError();
|
1008 |
+
|
1009 |
+
// if the path already exists, ensure that it's a directory
|
1010 |
+
if (error == ERROR_ALREADY_EXISTS) {
|
1011 |
+
const DWORD attributes = GetFileAttributesW(subpath.c_str());
|
1012 |
+
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
1013 |
+
return false;
|
1014 |
+
}
|
1015 |
+
} else {
|
1016 |
+
return false;
|
1017 |
+
}
|
1018 |
+
}
|
1019 |
+
|
1020 |
+
pos_slash += 1;
|
1021 |
+
}
|
1022 |
+
|
1023 |
+
return true;
|
1024 |
+
#else
|
1025 |
+
// if the path already exists, check whether it's a directory
|
1026 |
+
struct stat info;
|
1027 |
+
if (stat(path.c_str(), &info) == 0) {
|
1028 |
+
return S_ISDIR(info.st_mode);
|
1029 |
+
}
|
1030 |
+
|
1031 |
+
size_t pos_slash = 1; // skip leading slashes for directory creation
|
1032 |
+
|
1033 |
+
// process path from front to back, procedurally creating directories
|
1034 |
+
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
|
1035 |
+
const std::string subpath = path.substr(0, pos_slash);
|
1036 |
+
struct stat info;
|
1037 |
+
|
1038 |
+
// if the path already exists, ensure that it's a directory
|
1039 |
+
if (stat(subpath.c_str(), &info) == 0) {
|
1040 |
+
if (!S_ISDIR(info.st_mode)) {
|
1041 |
+
return false;
|
1042 |
+
}
|
1043 |
+
} else {
|
1044 |
+
// create parent directories
|
1045 |
+
const int ret = mkdir(subpath.c_str(), 0755);
|
1046 |
+
if (ret != 0) {
|
1047 |
+
return false;
|
1048 |
+
}
|
1049 |
+
}
|
1050 |
+
|
1051 |
+
pos_slash += 1;
|
1052 |
+
}
|
1053 |
+
|
1054 |
+
return true;
|
1055 |
+
#endif // _WIN32
|
1056 |
+
}
|
1057 |
+
|
1058 |
+
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
|
1059 |
+
if (data.empty()) {
|
1060 |
+
fprintf(stream, "%s:\n", prop_name);
|
1061 |
+
return;
|
1062 |
+
}
|
1063 |
+
|
1064 |
+
fprintf(stream, "%s: [", prop_name);
|
1065 |
+
for (size_t i = 0; i < data.size() - 1; ++i) {
|
1066 |
+
fprintf(stream, "%e, ", data[i]);
|
1067 |
+
}
|
1068 |
+
fprintf(stream, "%e]\n", data.back());
|
1069 |
+
}
|
1070 |
+
|
1071 |
+
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
|
1072 |
+
if (data.empty()) {
|
1073 |
+
fprintf(stream, "%s:\n", prop_name);
|
1074 |
+
return;
|
1075 |
+
}
|
1076 |
+
|
1077 |
+
fprintf(stream, "%s: [", prop_name);
|
1078 |
+
for (size_t i = 0; i < data.size() - 1; ++i) {
|
1079 |
+
fprintf(stream, "%d, ", data[i]);
|
1080 |
+
}
|
1081 |
+
fprintf(stream, "%d]\n", data.back());
|
1082 |
+
}
|
1083 |
+
|
1084 |
+
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
|
1085 |
+
std::string data_str(data == NULL ? "" : data);
|
1086 |
+
|
1087 |
+
if (data_str.empty()) {
|
1088 |
+
fprintf(stream, "%s:\n", prop_name);
|
1089 |
+
return;
|
1090 |
+
}
|
1091 |
+
|
1092 |
+
size_t pos_start = 0;
|
1093 |
+
size_t pos_found = 0;
|
1094 |
+
|
1095 |
+
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
1096 |
+
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
1097 |
+
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
1098 |
+
data_str = "\"" + data_str + "\"";
|
1099 |
+
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
1100 |
+
return;
|
1101 |
+
}
|
1102 |
+
|
1103 |
+
if (data_str.find('\n') == std::string::npos) {
|
1104 |
+
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
1105 |
+
return;
|
1106 |
+
}
|
1107 |
+
|
1108 |
+
fprintf(stream, "%s: |\n", prop_name);
|
1109 |
+
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
|
1110 |
+
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
|
1111 |
+
pos_start = pos_found + 1;
|
1112 |
+
}
|
1113 |
+
}
|
1114 |
+
|
1115 |
+
std::string get_sortable_timestamp() {
|
1116 |
+
using clock = std::chrono::system_clock;
|
1117 |
+
|
1118 |
+
const clock::time_point current_time = clock::now();
|
1119 |
+
const time_t as_time_t = clock::to_time_t(current_time);
|
1120 |
+
char timestamp_no_ns[100];
|
1121 |
+
std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
|
1122 |
+
|
1123 |
+
const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
|
1124 |
+
current_time.time_since_epoch() % 1000000000).count();
|
1125 |
+
char timestamp_ns[11];
|
1126 |
+
snprintf(timestamp_ns, 11, "%09" PRId64, ns);
|
1127 |
+
|
1128 |
+
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
1129 |
+
}
|
1130 |
+
|
1131 |
+
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
1132 |
+
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
1133 |
+
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
1134 |
+
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
1135 |
+
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
1136 |
+
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
1137 |
+
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
1138 |
+
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
1139 |
+
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
1140 |
+
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
1141 |
+
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
1142 |
+
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
1143 |
+
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
1144 |
+
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
1145 |
+
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
1146 |
+
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
1147 |
+
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
1148 |
+
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
1149 |
+
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
1150 |
+
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
1151 |
+
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
1152 |
+
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
1153 |
+
|
1154 |
+
#ifdef NDEBUG
|
1155 |
+
fprintf(stream, "debug: false\n");
|
1156 |
+
#else
|
1157 |
+
fprintf(stream, "debug: true\n");
|
1158 |
+
#endif // NDEBUG
|
1159 |
+
|
1160 |
+
fprintf(stream, "model_desc: %s\n", model_desc);
|
1161 |
+
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
|
1162 |
+
|
1163 |
+
#ifdef __OPTIMIZE__
|
1164 |
+
fprintf(stream, "optimize: true\n");
|
1165 |
+
#else
|
1166 |
+
fprintf(stream, "optimize: false\n");
|
1167 |
+
#endif // __OPTIMIZE__
|
1168 |
+
|
1169 |
+
fprintf(stream, "time: %s\n", timestamp.c_str());
|
1170 |
+
|
1171 |
+
fprintf(stream, "\n");
|
1172 |
+
fprintf(stream, "###############\n");
|
1173 |
+
fprintf(stream, "# User Inputs #\n");
|
1174 |
+
fprintf(stream, "###############\n");
|
1175 |
+
fprintf(stream, "\n");
|
1176 |
+
|
1177 |
+
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
1178 |
+
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
1179 |
+
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
1180 |
+
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
1181 |
+
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
1182 |
+
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
1183 |
+
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
1184 |
+
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
1185 |
+
fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
|
1186 |
+
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
1187 |
+
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
1188 |
+
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
1189 |
+
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
1190 |
+
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
1191 |
+
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
1192 |
+
|
1193 |
+
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
1194 |
+
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
1195 |
+
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
1196 |
+
|
1197 |
+
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
1198 |
+
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
1199 |
+
dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
1200 |
+
fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
|
1201 |
+
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
1202 |
+
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
1203 |
+
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
1204 |
+
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
1205 |
+
|
1206 |
+
fprintf(stream, "logit_bias:\n");
|
1207 |
+
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
1208 |
+
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
1209 |
+
continue;
|
1210 |
+
}
|
1211 |
+
fprintf(stream, " %d: %f", lb.first, lb.second);
|
1212 |
+
}
|
1213 |
+
|
1214 |
+
fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
|
1215 |
+
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
1216 |
+
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
|
1217 |
+
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
1218 |
+
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
1219 |
+
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
1220 |
+
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
1221 |
+
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
1222 |
+
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
1223 |
+
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
1224 |
+
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
1225 |
+
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
1226 |
+
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
1227 |
+
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
1228 |
+
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
1229 |
+
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
1230 |
+
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
1231 |
+
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
1232 |
+
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
1233 |
+
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
1234 |
+
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
1235 |
+
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
1236 |
+
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
1237 |
+
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
1238 |
+
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
1239 |
+
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
1240 |
+
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
1241 |
+
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
1242 |
+
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
1243 |
+
|
1244 |
+
fprintf(stream, "reverse_prompt:\n");
|
1245 |
+
for (std::string ap : params.antiprompt) {
|
1246 |
+
size_t pos = 0;
|
1247 |
+
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
1248 |
+
ap.replace(pos, 1, "\\n");
|
1249 |
+
pos += 1;
|
1250 |
+
}
|
1251 |
+
|
1252 |
+
fprintf(stream, " - %s\n", ap.c_str());
|
1253 |
+
}
|
1254 |
+
|
1255 |
+
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
1256 |
+
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
1257 |
+
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
1258 |
+
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
1259 |
+
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
1260 |
+
|
1261 |
+
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
1262 |
+
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
1263 |
+
|
1264 |
+
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
1265 |
+
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
1266 |
+
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
1267 |
+
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
1268 |
+
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
1269 |
+
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
1270 |
+
}
|
common/common.h
ADDED
@@ -0,0 +1,211 @@
1 |
+
// Various helper functions and utilities
|
2 |
+
|
3 |
+
#pragma once
|
4 |
+
|
5 |
+
#include "llama.h"
|
6 |
+
|
7 |
+
#define LOG_NO_FILE_LINE_FUNCTION
|
8 |
+
#include "log.h"
|
9 |
+
|
10 |
+
#include <string>
|
11 |
+
#include <vector>
|
12 |
+
#include <random>
|
13 |
+
#include <thread>
|
14 |
+
#include <unordered_map>
|
15 |
+
#include <tuple>
|
16 |
+
|
17 |
+
#ifdef _WIN32
|
18 |
+
#define DIRECTORY_SEPARATOR '\\'
|
19 |
+
#else
|
20 |
+
#define DIRECTORY_SEPARATOR '/'
|
21 |
+
#endif // _WIN32
|
22 |
+
|
23 |
+
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
24 |
+
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
25 |
+
|
26 |
+
#define print_build_info() do { \
|
27 |
+
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
28 |
+
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
29 |
+
} while(0)
|
30 |
+
|
31 |
+
//
|
32 |
+
// CLI argument parsing
|
33 |
+
//
|
34 |
+
int32_t get_num_physical_cores();
|
35 |
+
|
36 |
+
struct gpt_params {
|
37 |
+
uint32_t seed = -1; // RNG seed
|
38 |
+
int32_t n_threads = get_num_physical_cores();
|
39 |
+
int32_t n_predict = -1; // new tokens to predict
|
40 |
+
int32_t n_ctx = 512; // context size
|
41 |
+
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
42 |
+
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
43 |
+
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
44 |
+
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
45 |
+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
46 |
+
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
47 |
+
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
48 |
+
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
49 |
+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
50 |
+
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
51 |
+
float rope_freq_base = 10000.0f; // RoPE base frequency
|
52 |
+
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
53 |
+
|
54 |
+
// sampling parameters
|
55 |
+
int32_t top_k = 40; // <= 0 to use vocab size
|
56 |
+
float top_p = 0.95f; // 1.0 = disabled
|
57 |
+
float tfs_z = 1.00f; // 1.0 = disabled
|
58 |
+
float typical_p = 1.00f; // 1.0 = disabled
|
59 |
+
float temp = 0.80f; // 1.0 = disabled
|
60 |
+
float repeat_penalty = 1.10f; // 1.0 = disabled
|
61 |
+
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
62 |
+
float frequency_penalty = 0.00f; // 0.0 = disabled
|
63 |
+
float presence_penalty = 0.00f; // 0.0 = disabled
|
64 |
+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
65 |
+
float mirostat_tau = 5.00f; // target entropy
|
66 |
+
float mirostat_eta = 0.10f; // learning rate
|
67 |
+
|
68 |
+
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
69 |
+
|
70 |
+
// Classifier-Free Guidance
|
71 |
+
// https://arxiv.org/abs/2306.17806
|
72 |
+
std::string cfg_negative_prompt; // string to help guidance
|
73 |
+
float cfg_scale = 1.f; // How strong is guidance
|
74 |
+
|
75 |
+
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
76 |
+
std::string model_draft = ""; // draft model for speculative decoding
|
77 |
+
std::string model_alias = "unknown"; // model alias
|
78 |
+
std::string prompt = "";
|
79 |
+
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
80 |
+
std::string input_prefix = ""; // string to prefix user inputs with
|
81 |
+
std::string input_suffix = ""; // string to suffix user inputs with
|
82 |
+
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
|
83 |
+
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
84 |
+
std::string logdir = ""; // directory in which to save YAML log files
|
85 |
+
|
86 |
+
std::string lora_adapter = ""; // lora adapter path
|
87 |
+
std::string lora_base = ""; // base model path for the lora adapter
|
88 |
+
|
89 |
+
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
90 |
+
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
91 |
+
// (which is more convenient to use for plotting)
|
92 |
+
//
|
93 |
+
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
94 |
+
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
95 |
+
|
96 |
+
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
97 |
+
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
98 |
+
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
99 |
+
bool random_prompt = false; // do not randomize prompt if none provided
|
100 |
+
bool use_color = false; // use color to distinguish generations and inputs
|
101 |
+
bool interactive = false; // interactive mode
|
102 |
+
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
103 |
+
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
104 |
+
|
105 |
+
bool embedding = false; // get only sentence embedding
|
106 |
+
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
107 |
+
bool interactive_first = false; // wait for user input immediately
|
108 |
+
bool multiline_input = false; // reverse the usage of `\`
|
109 |
+
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
110 |
+
|
111 |
+
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
112 |
+
bool ignore_eos = false; // ignore generated EOS tokens
|
113 |
+
bool instruct = false; // instruction mode (used for Alpaca models)
|
114 |
+
bool penalize_nl = true; // consider newlines as a repeatable token
|
115 |
+
bool perplexity = false; // compute perplexity over the prompt
|
116 |
+
bool use_mmap = true; // use mmap for faster loads
|
117 |
+
bool use_mlock = false; // use mlock to keep model in memory
|
118 |
+
bool numa = false; // attempt optimizations that help on some NUMA systems
|
119 |
+
bool export_cgraph = false; // export the computation graph
|
120 |
+
bool verbose_prompt = false; // print prompt tokens before generation
|
121 |
+
};
|
122 |
+
|
123 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
124 |
+
|
125 |
+
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
126 |
+
|
127 |
+
std::string gpt_random_prompt(std::mt19937 & rng);
|
128 |
+
|
129 |
+
//
|
130 |
+
// Model utils
|
131 |
+
//
|
132 |
+
|
133 |
+
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
134 |
+
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
135 |
+
|
136 |
+
//
|
137 |
+
// Vocab utils
|
138 |
+
//
|
139 |
+
|
140 |
+
// tokenizes a string into a vector of tokens
|
141 |
+
// should work similar to Python's `tokenizer.encode`
|
142 |
+
std::vector<llama_token> llama_tokenize(
|
143 |
+
struct llama_context * ctx,
|
144 |
+
const std::string & text,
|
145 |
+
bool add_bos);
|
146 |
+
|
147 |
+
// tokenizes a token into a piece
|
148 |
+
// should work similar to Python's `tokenizer.id_to_piece`
|
149 |
+
std::string llama_token_to_piece(
|
150 |
+
const struct llama_context * ctx,
|
151 |
+
llama_token token);
|
152 |
+
|
153 |
+
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
|
154 |
+
// that takes into account the tokenizer type and decides how to handle the leading space
|
155 |
+
//
|
156 |
+
// detokenizes a vector of tokens into a string
|
157 |
+
// should work similar to Python's `tokenizer.decode`
|
158 |
+
// removes the leading space from the first non-BOS token
|
159 |
+
std::string llama_detokenize_spm(
|
160 |
+
llama_context * ctx,
|
161 |
+
const std::vector<llama_token> & tokens);
|
162 |
+
|
163 |
+
// detokenizes a vector of tokens into a string
|
164 |
+
// should work similar to Python's `tokenizer.decode`
|
165 |
+
std::string llama_detokenize_bpe(
|
166 |
+
llama_context * ctx,
|
167 |
+
const std::vector<llama_token> & tokens);
|
168 |
+
|
169 |
+
//
|
170 |
+
// Sampling utils
|
171 |
+
//
|
172 |
+
|
173 |
+
// this is a common sampling function used across the examples for convenience
|
174 |
+
// it can serve as a starting point for implementing your own sampling function
|
175 |
+
//
|
176 |
+
// required:
|
177 |
+
// - ctx: context to use for sampling
|
178 |
+
// - params: sampling parameters
|
179 |
+
//
|
180 |
+
// optional:
|
181 |
+
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
182 |
+
// - grammar: grammar to use for sampling, ignore if NULL
|
183 |
+
// - last_tokens: needed for repetition penalty, ignore if empty
|
184 |
+
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
|
185 |
+
//
|
186 |
+
// returns:
|
187 |
+
// - token: sampled token
|
188 |
+
// - candidates: vector of candidate tokens
|
189 |
+
//
|
190 |
+
llama_token llama_sample_token(
|
191 |
+
struct llama_context * ctx,
|
192 |
+
struct llama_context * ctx_guidance,
|
193 |
+
struct llama_grammar * grammar,
|
194 |
+
const struct gpt_params & params,
|
195 |
+
const std::vector<llama_token> & last_tokens,
|
196 |
+
std::vector<llama_token_data> & candidates,
|
197 |
+
int idx = 0);
|
198 |
+
|
199 |
+
//
|
200 |
+
// YAML utils
|
201 |
+
//
|
202 |
+
|
203 |
+
bool create_directory_with_parents(const std::string & path);
|
204 |
+
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
|
205 |
+
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
|
206 |
+
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
|
207 |
+
std::string get_sortable_timestamp();
|
208 |
+
|
209 |
+
void dump_non_result_info_yaml(
|
210 |
+
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
211 |
+
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
common/console.cpp
ADDED
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "console.h"
|
2 |
+
#include <vector>
|
3 |
+
#include <iostream>
|
4 |
+
|
5 |
+
#if defined(_WIN32)
|
6 |
+
#define WIN32_LEAN_AND_MEAN
|
7 |
+
#ifndef NOMINMAX
|
8 |
+
#define NOMINMAX
|
9 |
+
#endif
|
10 |
+
#include <windows.h>
|
11 |
+
#include <fcntl.h>
|
12 |
+
#include <io.h>
|
13 |
+
#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
|
14 |
+
#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
|
15 |
+
#endif
|
16 |
+
#else
|
17 |
+
#include <climits>
|
18 |
+
#include <sys/ioctl.h>
|
19 |
+
#include <unistd.h>
|
20 |
+
#include <wchar.h>
|
21 |
+
#include <stdio.h>
|
22 |
+
#include <stdlib.h>
|
23 |
+
#include <signal.h>
|
24 |
+
#include <termios.h>
|
25 |
+
#endif
|
26 |
+
|
27 |
+
#define ANSI_COLOR_RED "\x1b[31m"
|
28 |
+
#define ANSI_COLOR_GREEN "\x1b[32m"
|
29 |
+
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
30 |
+
#define ANSI_COLOR_BLUE "\x1b[34m"
|
31 |
+
#define ANSI_COLOR_MAGENTA "\x1b[35m"
|
32 |
+
#define ANSI_COLOR_CYAN "\x1b[36m"
|
33 |
+
#define ANSI_COLOR_RESET "\x1b[0m"
|
34 |
+
#define ANSI_BOLD "\x1b[1m"
|
35 |
+
|
36 |
+
namespace console {
|
37 |
+
|
38 |
+
//
|
39 |
+
// Console state
|
40 |
+
//
|
41 |
+
|
42 |
+
static bool advanced_display = false;
|
43 |
+
static bool simple_io = true;
|
44 |
+
static display_t current_display = reset;
|
45 |
+
|
46 |
+
static FILE* out = stdout;
|
47 |
+
|
48 |
+
#if defined (_WIN32)
|
49 |
+
static void* hConsole;
|
50 |
+
#else
|
51 |
+
static FILE* tty = nullptr;
|
52 |
+
static termios initial_state;
|
53 |
+
#endif
|
54 |
+
|
55 |
+
//
|
56 |
+
// Init and cleanup
|
57 |
+
//
|
58 |
+
|
59 |
+
void init(bool use_simple_io, bool use_advanced_display) {
|
60 |
+
advanced_display = use_advanced_display;
|
61 |
+
simple_io = use_simple_io;
|
62 |
+
#if defined(_WIN32)
|
63 |
+
// Windows-specific console initialization
|
64 |
+
DWORD dwMode = 0;
|
65 |
+
hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
66 |
+
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
67 |
+
hConsole = GetStdHandle(STD_ERROR_HANDLE);
|
68 |
+
if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
|
69 |
+
hConsole = nullptr;
|
70 |
+
simple_io = true;
|
71 |
+
}
|
72 |
+
}
|
73 |
+
if (hConsole) {
|
74 |
+
// Check conditions combined to reduce nesting
|
75 |
+
if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
|
76 |
+
!SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
|
77 |
+
advanced_display = false;
|
78 |
+
}
|
79 |
+
// Set console output codepage to UTF8
|
80 |
+
SetConsoleOutputCP(CP_UTF8);
|
81 |
+
}
|
82 |
+
HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
|
83 |
+
if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
|
84 |
+
// Set console input codepage to UTF16
|
85 |
+
_setmode(_fileno(stdin), _O_WTEXT);
|
86 |
+
|
87 |
+
// Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
|
88 |
+
if (simple_io) {
|
89 |
+
dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
|
90 |
+
} else {
|
91 |
+
dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
|
92 |
+
}
|
93 |
+
if (!SetConsoleMode(hConIn, dwMode)) {
|
94 |
+
simple_io = true;
|
95 |
+
}
|
96 |
+
}
|
97 |
+
#else
|
98 |
+
// POSIX-specific console initialization
|
99 |
+
if (!simple_io) {
|
100 |
+
struct termios new_termios;
|
101 |
+
tcgetattr(STDIN_FILENO, &initial_state);
|
102 |
+
new_termios = initial_state;
|
103 |
+
new_termios.c_lflag &= ~(ICANON | ECHO);
|
104 |
+
new_termios.c_cc[VMIN] = 1;
|
105 |
+
new_termios.c_cc[VTIME] = 0;
|
106 |
+
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
|
107 |
+
|
108 |
+
tty = fopen("/dev/tty", "w+");
|
109 |
+
if (tty != nullptr) {
|
110 |
+
out = tty;
|
111 |
+
}
|
112 |
+
}
|
113 |
+
|
114 |
+
setlocale(LC_ALL, "");
|
115 |
+
#endif
|
116 |
+
}
|
117 |
+
|
118 |
+
void cleanup() {
|
119 |
+
// Reset console display
|
120 |
+
set_display(reset);
|
121 |
+
|
122 |
+
#if !defined(_WIN32)
|
123 |
+
// Restore settings on POSIX systems
|
124 |
+
if (!simple_io) {
|
125 |
+
if (tty != nullptr) {
|
126 |
+
out = stdout;
|
127 |
+
fclose(tty);
|
128 |
+
tty = nullptr;
|
129 |
+
}
|
130 |
+
tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
|
131 |
+
}
|
132 |
+
#endif
|
133 |
+
}
|
134 |
+
|
135 |
+
//
|
136 |
+
// Display and IO
|
137 |
+
//
|
138 |
+
|
139 |
+
// Keep track of current display and only emit ANSI code if it changes
|
140 |
+
void set_display(display_t display) {
|
141 |
+
if (advanced_display && current_display != display) {
|
142 |
+
fflush(stdout);
|
143 |
+
switch(display) {
|
144 |
+
case reset:
|
145 |
+
fprintf(out, ANSI_COLOR_RESET);
|
146 |
+
break;
|
147 |
+
case prompt:
|
148 |
+
fprintf(out, ANSI_COLOR_YELLOW);
|
149 |
+
break;
|
150 |
+
case user_input:
|
151 |
+
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
|
152 |
+
break;
|
153 |
+
case error:
|
154 |
+
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
|
155 |
+
}
|
156 |
+
current_display = display;
|
157 |
+
fflush(out);
|
158 |
+
}
|
159 |
+
}
|
160 |
+
|
161 |
+
static char32_t getchar32() {
|
162 |
+
#if defined(_WIN32)
|
163 |
+
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
|
164 |
+
wchar_t high_surrogate = 0;
|
165 |
+
|
166 |
+
while (true) {
|
167 |
+
INPUT_RECORD record;
|
168 |
+
DWORD count;
|
169 |
+
if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
|
170 |
+
return WEOF;
|
171 |
+
}
|
172 |
+
|
173 |
+
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
|
174 |
+
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
|
175 |
+
if (wc == 0) {
|
176 |
+
continue;
|
177 |
+
}
|
178 |
+
|
179 |
+
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
|
180 |
+
high_surrogate = wc;
|
181 |
+
continue;
|
182 |
+
}
|
183 |
+
if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
|
184 |
+
if (high_surrogate != 0) { // Check if we have a high surrogate
|
185 |
+
return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
|
186 |
+
}
|
187 |
+
}
|
188 |
+
|
189 |
+
high_surrogate = 0; // Reset the high surrogate
|
190 |
+
return static_cast<char32_t>(wc);
|
191 |
+
}
|
192 |
+
}
|
193 |
+
#else
|
194 |
+
wchar_t wc = getwchar();
|
195 |
+
if (static_cast<wint_t>(wc) == WEOF) {
|
196 |
+
return WEOF;
|
197 |
+
}
|
198 |
+
|
199 |
+
#if WCHAR_MAX == 0xFFFF
|
200 |
+
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
|
201 |
+
wchar_t low_surrogate = getwchar();
|
202 |
+
if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
|
203 |
+
return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
|
204 |
+
}
|
205 |
+
}
|
206 |
+
if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
|
207 |
+
return 0xFFFD; // Return the replacement character U+FFFD
|
208 |
+
}
|
209 |
+
#endif
|
210 |
+
|
211 |
+
return static_cast<char32_t>(wc);
|
212 |
+
#endif
|
213 |
+
}
|
214 |
+
|
215 |
+
static void pop_cursor() {
|
216 |
+
#if defined(_WIN32)
|
217 |
+
if (hConsole != NULL) {
|
218 |
+
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
219 |
+
GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
|
220 |
+
|
221 |
+
COORD newCursorPosition = bufferInfo.dwCursorPosition;
|
222 |
+
if (newCursorPosition.X == 0) {
|
223 |
+
newCursorPosition.X = bufferInfo.dwSize.X - 1;
|
224 |
+
newCursorPosition.Y -= 1;
|
225 |
+
} else {
|
226 |
+
newCursorPosition.X -= 1;
|
227 |
+
}
|
228 |
+
|
229 |
+
SetConsoleCursorPosition(hConsole, newCursorPosition);
|
230 |
+
return;
|
231 |
+
}
|
232 |
+
#endif
|
233 |
+
putc('\b', out);
|
234 |
+
}
|
235 |
+
|
236 |
+
static int estimateWidth(char32_t codepoint) {
|
237 |
+
#if defined(_WIN32)
|
238 |
+
(void)codepoint;
|
239 |
+
return 1;
|
240 |
+
#else
|
241 |
+
return wcwidth(codepoint);
|
242 |
+
#endif
|
243 |
+
}
|
244 |
+
|
245 |
+
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
246 |
+
#if defined(_WIN32)
|
247 |
+
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
248 |
+
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
|
249 |
+
// go with the default
|
250 |
+
return expectedWidth;
|
251 |
+
}
|
252 |
+
COORD initialPosition = bufferInfo.dwCursorPosition;
|
253 |
+
DWORD nNumberOfChars = length;
|
254 |
+
WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
|
255 |
+
|
256 |
+
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
|
257 |
+
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
|
258 |
+
|
259 |
+
// Figure out our real position if we're in the last column
|
260 |
+
if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
|
261 |
+
DWORD nNumberOfChars;
|
262 |
+
WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
|
263 |
+
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
|
264 |
+
}
|
265 |
+
|
266 |
+
int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
|
267 |
+
if (width < 0) {
|
268 |
+
width += newBufferInfo.dwSize.X;
|
269 |
+
}
|
270 |
+
return width;
|
271 |
+
#else
|
272 |
+
// We can trust expectedWidth if we've got one
|
273 |
+
if (expectedWidth >= 0 || tty == nullptr) {
|
274 |
+
fwrite(utf8_codepoint, length, 1, out);
|
275 |
+
return expectedWidth;
|
276 |
+
}
|
277 |
+
|
278 |
+
fputs("\033[6n", tty); // Query cursor position
|
279 |
+
int x1;
|
280 |
+
int y1;
|
281 |
+
int x2;
|
282 |
+
int y2;
|
283 |
+
int results = 0;
|
284 |
+
results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
|
285 |
+
|
286 |
+
fwrite(utf8_codepoint, length, 1, tty);
|
287 |
+
|
288 |
+
fputs("\033[6n", tty); // Query cursor position
|
289 |
+
results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
|
290 |
+
|
291 |
+
if (results != 4) {
|
292 |
+
return expectedWidth;
|
293 |
+
}
|
294 |
+
|
295 |
+
int width = x2 - x1;
|
296 |
+
if (width < 0) {
|
297 |
+
// Calculate the width considering text wrapping
|
298 |
+
struct winsize w;
|
299 |
+
ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
|
300 |
+
width += w.ws_col;
|
301 |
+
}
|
302 |
+
return width;
|
303 |
+
#endif
|
304 |
+
}
|
305 |
+
|
306 |
+
static void replace_last(char ch) {
|
307 |
+
#if defined(_WIN32)
|
308 |
+
pop_cursor();
|
309 |
+
put_codepoint(&ch, 1, 1);
|
310 |
+
#else
|
311 |
+
fprintf(out, "\b%c", ch);
|
312 |
+
#endif
|
313 |
+
}
|
314 |
+
|
315 |
+
static void append_utf8(char32_t ch, std::string & out) {
|
316 |
+
if (ch <= 0x7F) {
|
317 |
+
out.push_back(static_cast<unsigned char>(ch));
|
318 |
+
} else if (ch <= 0x7FF) {
|
319 |
+
out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
|
320 |
+
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
321 |
+
} else if (ch <= 0xFFFF) {
|
322 |
+
out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
|
323 |
+
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
|
324 |
+
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
325 |
+
} else if (ch <= 0x10FFFF) {
|
326 |
+
out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
|
327 |
+
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
|
328 |
+
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
|
329 |
+
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
330 |
+
} else {
|
331 |
+
// Invalid Unicode code point
|
332 |
+
}
|
333 |
+
}
|
334 |
+
|
335 |
+
// Helper function to remove the last UTF-8 character from a string
|
336 |
+
static void pop_back_utf8_char(std::string & line) {
|
337 |
+
if (line.empty()) {
|
338 |
+
return;
|
339 |
+
}
|
340 |
+
|
341 |
+
size_t pos = line.length() - 1;
|
342 |
+
|
343 |
+
// Find the start of the last UTF-8 character (checking up to 4 bytes back)
|
344 |
+
for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
|
345 |
+
if ((line[pos] & 0xC0) != 0x80) {
|
346 |
+
break; // Found the start of the character
|
347 |
+
}
|
348 |
+
}
|
349 |
+
line.erase(pos);
|
350 |
+
}
|
351 |
+
|
352 |
+
static bool readline_advanced(std::string & line, bool multiline_input) {
|
353 |
+
if (out != stdout) {
|
354 |
+
fflush(stdout);
|
355 |
+
}
|
356 |
+
|
357 |
+
line.clear();
|
358 |
+
std::vector<int> widths;
|
359 |
+
bool is_special_char = false;
|
360 |
+
bool end_of_stream = false;
|
361 |
+
|
362 |
+
char32_t input_char;
|
363 |
+
while (true) {
|
364 |
+
fflush(out); // Ensure all output is displayed before waiting for input
|
365 |
+
input_char = getchar32();
|
366 |
+
|
367 |
+
if (input_char == '\r' || input_char == '\n') {
|
368 |
+
break;
|
369 |
+
}
|
370 |
+
|
371 |
+
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
|
372 |
+
end_of_stream = true;
|
373 |
+
break;
|
374 |
+
}
|
375 |
+
|
376 |
+
if (is_special_char) {
|
377 |
+
set_display(user_input);
|
378 |
+
replace_last(line.back());
|
379 |
+
is_special_char = false;
|
380 |
+
}
|
381 |
+
|
382 |
+
if (input_char == '\033') { // Escape sequence
|
383 |
+
char32_t code = getchar32();
|
384 |
+
if (code == '[' || code == 0x1B) {
|
385 |
+
// Discard the rest of the escape sequence
|
386 |
+
while ((code = getchar32()) != (char32_t) WEOF) {
|
387 |
+
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
|
388 |
+
break;
|
389 |
+
}
|
390 |
+
}
|
391 |
+
}
|
392 |
+
} else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
|
393 |
+
if (!widths.empty()) {
|
394 |
+
int count;
|
395 |
+
do {
|
396 |
+
count = widths.back();
|
397 |
+
widths.pop_back();
|
398 |
+
// Move cursor back, print space, and move cursor back again
|
399 |
+
for (int i = 0; i < count; i++) {
|
400 |
+
replace_last(' ');
|
401 |
+
pop_cursor();
|
402 |
+
}
|
403 |
+
pop_back_utf8_char(line);
|
404 |
+
} while (count == 0 && !widths.empty());
|
405 |
+
}
|
406 |
+
} else {
|
407 |
+
int offset = line.length();
|
408 |
+
append_utf8(input_char, line);
|
409 |
+
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
|
410 |
+
if (width < 0) {
|
411 |
+
width = 0;
|
412 |
+
}
|
413 |
+
widths.push_back(width);
|
414 |
+
}
|
415 |
+
|
416 |
+
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
|
417 |
+
set_display(prompt);
|
418 |
+
replace_last(line.back());
|
419 |
+
is_special_char = true;
|
420 |
+
}
|
421 |
+
}
|
422 |
+
|
423 |
+
bool has_more = multiline_input;
|
424 |
+
if (is_special_char) {
|
425 |
+
replace_last(' ');
|
426 |
+
pop_cursor();
|
427 |
+
|
428 |
+
char last = line.back();
|
429 |
+
line.pop_back();
|
430 |
+
if (last == '\\') {
|
431 |
+
line += '\n';
|
432 |
+
fputc('\n', out);
|
433 |
+
has_more = !has_more;
|
434 |
+
} else {
|
435 |
+
// llama will just eat the single space, it won't act as a space
|
436 |
+
if (line.length() == 1 && line.back() == ' ') {
|
437 |
+
line.clear();
|
438 |
+
pop_cursor();
|
439 |
+
}
|
440 |
+
has_more = false;
|
441 |
+
}
|
442 |
+
} else {
|
443 |
+
if (end_of_stream) {
|
444 |
+
has_more = false;
|
445 |
+
} else {
|
446 |
+
line += '\n';
|
447 |
+
fputc('\n', out);
|
448 |
+
}
|
449 |
+
}
|
450 |
+
|
451 |
+
fflush(out);
|
452 |
+
return has_more;
|
453 |
+
}
|
454 |
+
|
455 |
+
static bool readline_simple(std::string & line, bool multiline_input) {
|
456 |
+
#if defined(_WIN32)
|
457 |
+
std::wstring wline;
|
458 |
+
if (!std::getline(std::wcin, wline)) {
|
459 |
+
// Input stream is bad or EOF received
|
460 |
+
line.clear();
|
461 |
+
GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
|
462 |
+
return false;
|
463 |
+
}
|
464 |
+
|
465 |
+
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
|
466 |
+
line.resize(size_needed);
|
467 |
+
WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
|
468 |
+
#else
|
469 |
+
if (!std::getline(std::cin, line)) {
|
470 |
+
// Input stream is bad or EOF received
|
471 |
+
line.clear();
|
472 |
+
return false;
|
473 |
+
}
|
474 |
+
#endif
|
475 |
+
if (!line.empty()) {
|
476 |
+
char last = line.back();
|
477 |
+
if (last == '/') { // Always return control on '/' symbol
|
478 |
+
line.pop_back();
|
479 |
+
return false;
|
480 |
+
}
|
481 |
+
if (last == '\\') { // '\\' changes the default action
|
482 |
+
line.pop_back();
|
483 |
+
multiline_input = !multiline_input;
|
484 |
+
}
|
485 |
+
}
|
486 |
+
line += '\n';
|
487 |
+
|
488 |
+
// By default, continue input if multiline_input is set
|
489 |
+
return multiline_input;
|
490 |
+
}
|
491 |
+
|
492 |
+
bool readline(std::string & line, bool multiline_input) {
|
493 |
+
set_display(user_input);
|
494 |
+
|
495 |
+
if (simple_io) {
|
496 |
+
return readline_simple(line, multiline_input);
|
497 |
+
}
|
498 |
+
return readline_advanced(line, multiline_input);
|
499 |
+
}
|
500 |
+
|
501 |
+
}
|
common/console.h
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Console functions
|
2 |
+
|
3 |
+
#pragma once
|
4 |
+
|
5 |
+
#include <string>
|
6 |
+
|
7 |
+
namespace console {
|
8 |
+
enum display_t {
|
9 |
+
reset = 0,
|
10 |
+
prompt,
|
11 |
+
user_input,
|
12 |
+
error
|
13 |
+
};
|
14 |
+
|
15 |
+
void init(bool use_simple_io, bool use_advanced_display);
|
16 |
+
void cleanup();
|
17 |
+
void set_display(display_t display);
|
18 |
+
bool readline(std::string & line, bool multiline_input);
|
19 |
+
}
|
common/grammar-parser.cpp
ADDED
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "grammar-parser.h"
|
2 |
+
#include <cstdint>
|
3 |
+
#include <cwchar>
|
4 |
+
#include <string>
|
5 |
+
#include <utility>
|
6 |
+
#include <stdexcept>
|
7 |
+
#include <exception>
|
8 |
+
|
9 |
+
namespace grammar_parser {
|
10 |
+
// NOTE: assumes valid utf8 (but checks for overrun)
|
11 |
+
// copied from llama.cpp
|
12 |
+
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
13 |
+
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
14 |
+
uint8_t first_byte = static_cast<uint8_t>(*src);
|
15 |
+
uint8_t highbits = first_byte >> 4;
|
16 |
+
int len = lookup[highbits];
|
17 |
+
uint8_t mask = (1 << (8 - len)) - 1;
|
18 |
+
uint32_t value = first_byte & mask;
|
19 |
+
const char * end = src + len; // may overrun!
|
20 |
+
const char * pos = src + 1;
|
21 |
+
for ( ; pos < end && *pos; pos++) {
|
22 |
+
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
23 |
+
}
|
24 |
+
return std::make_pair(value, pos);
|
25 |
+
}
|
26 |
+
|
27 |
+
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
28 |
+
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
29 |
+
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
30 |
+
return result.first->second;
|
31 |
+
}
|
32 |
+
|
33 |
+
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
34 |
+
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
35 |
+
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
36 |
+
return next_id;
|
37 |
+
}
|
38 |
+
|
39 |
+
static void add_rule(
|
40 |
+
parse_state & state,
|
41 |
+
uint32_t rule_id,
|
42 |
+
const std::vector<llama_grammar_element> & rule) {
|
43 |
+
if (state.rules.size() <= rule_id) {
|
44 |
+
state.rules.resize(rule_id + 1);
|
45 |
+
}
|
46 |
+
state.rules[rule_id] = rule;
|
47 |
+
}
|
48 |
+
|
49 |
+
static bool is_word_char(char c) {
|
50 |
+
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
|
51 |
+
}
|
52 |
+
|
53 |
+
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
54 |
+
const char * pos = src;
|
55 |
+
const char * end = src + size;
|
56 |
+
uint32_t value = 0;
|
57 |
+
for ( ; pos < end && *pos; pos++) {
|
58 |
+
value <<= 4;
|
59 |
+
char c = *pos;
|
60 |
+
if ('a' <= c && c <= 'f') {
|
61 |
+
value += c - 'a' + 10;
|
62 |
+
} else if ('A' <= c && c <= 'F') {
|
63 |
+
value += c - 'A' + 10;
|
64 |
+
} else if ('0' <= c && c <= '9') {
|
65 |
+
value += c - '0';
|
66 |
+
} else {
|
67 |
+
break;
|
68 |
+
}
|
69 |
+
}
|
70 |
+
if (pos != end) {
|
71 |
+
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
|
72 |
+
}
|
73 |
+
return std::make_pair(value, pos);
|
74 |
+
}
|
75 |
+
|
76 |
+
static const char * parse_space(const char * src, bool newline_ok) {
|
77 |
+
const char * pos = src;
|
78 |
+
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
79 |
+
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
80 |
+
if (*pos == '#') {
|
81 |
+
while (*pos && *pos != '\r' && *pos != '\n') {
|
82 |
+
pos++;
|
83 |
+
}
|
84 |
+
} else {
|
85 |
+
pos++;
|
86 |
+
}
|
87 |
+
}
|
88 |
+
return pos;
|
89 |
+
}
|
90 |
+
|
91 |
+
static const char * parse_name(const char * src) {
|
92 |
+
const char * pos = src;
|
93 |
+
while (is_word_char(*pos)) {
|
94 |
+
pos++;
|
95 |
+
}
|
96 |
+
if (pos == src) {
|
97 |
+
throw std::runtime_error(std::string("expecting name at ") + src);
|
98 |
+
}
|
99 |
+
return pos;
|
100 |
+
}
|
101 |
+
|
102 |
+
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
103 |
+
if (*src == '\\') {
|
104 |
+
switch (src[1]) {
|
105 |
+
case 'x': return parse_hex(src + 2, 2);
|
106 |
+
case 'u': return parse_hex(src + 2, 4);
|
107 |
+
case 'U': return parse_hex(src + 2, 8);
|
108 |
+
case 't': return std::make_pair('\t', src + 2);
|
109 |
+
case 'r': return std::make_pair('\r', src + 2);
|
110 |
+
case 'n': return std::make_pair('\n', src + 2);
|
111 |
+
case '\\':
|
112 |
+
case '"':
|
113 |
+
case '[':
|
114 |
+
case ']':
|
115 |
+
return std::make_pair(src[1], src + 2);
|
116 |
+
default:
|
117 |
+
throw std::runtime_error(std::string("unknown escape at ") + src);
|
118 |
+
}
|
119 |
+
} else if (*src) {
|
120 |
+
return decode_utf8(src);
|
121 |
+
}
|
122 |
+
throw std::runtime_error("unexpected end of input");
|
123 |
+
}
|
124 |
+
|
125 |
+
const char * parse_alternates(
|
126 |
+
parse_state & state,
|
127 |
+
const char * src,
|
128 |
+
const std::string & rule_name,
|
129 |
+
uint32_t rule_id,
|
130 |
+
bool is_nested);
|
131 |
+
|
132 |
+
static const char * parse_sequence(
|
133 |
+
parse_state & state,
|
134 |
+
const char * src,
|
135 |
+
const std::string & rule_name,
|
136 |
+
std::vector<llama_grammar_element> & out_elements,
|
137 |
+
bool is_nested) {
|
138 |
+
size_t last_sym_start = out_elements.size();
|
139 |
+
const char * pos = src;
|
140 |
+
while (*pos) {
|
141 |
+
if (*pos == '"') { // literal string
|
142 |
+
pos++;
|
143 |
+
last_sym_start = out_elements.size();
|
144 |
+
while (*pos != '"') {
|
145 |
+
auto char_pair = parse_char(pos);
|
146 |
+
pos = char_pair.second;
|
147 |
+
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
148 |
+
}
|
149 |
+
pos = parse_space(pos + 1, is_nested);
|
150 |
+
} else if (*pos == '[') { // char range(s)
|
151 |
+
pos++;
|
152 |
+
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
153 |
+
if (*pos == '^') {
|
154 |
+
pos++;
|
155 |
+
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
156 |
+
}
|
157 |
+
last_sym_start = out_elements.size();
|
158 |
+
while (*pos != ']') {
|
159 |
+
auto char_pair = parse_char(pos);
|
160 |
+
pos = char_pair.second;
|
161 |
+
enum llama_gretype type = last_sym_start < out_elements.size()
|
162 |
+
? LLAMA_GRETYPE_CHAR_ALT
|
163 |
+
: start_type;
|
164 |
+
|
165 |
+
out_elements.push_back({type, char_pair.first});
|
166 |
+
if (pos[0] == '-' && pos[1] != ']') {
|
167 |
+
auto endchar_pair = parse_char(pos + 1);
|
168 |
+
pos = endchar_pair.second;
|
169 |
+
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
170 |
+
}
|
171 |
+
}
|
172 |
+
pos = parse_space(pos + 1, is_nested);
|
173 |
+
} else if (is_word_char(*pos)) { // rule reference
|
174 |
+
const char * name_end = parse_name(pos);
|
175 |
+
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
|
176 |
+
pos = parse_space(name_end, is_nested);
|
177 |
+
last_sym_start = out_elements.size();
|
178 |
+
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
179 |
+
} else if (*pos == '(') { // grouping
|
180 |
+
// parse nested alternates into synthesized rule
|
181 |
+
pos = parse_space(pos + 1, true);
|
182 |
+
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
183 |
+
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
|
184 |
+
last_sym_start = out_elements.size();
|
185 |
+
// output reference to synthesized rule
|
186 |
+
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
187 |
+
if (*pos != ')') {
|
188 |
+
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
189 |
+
}
|
190 |
+
pos = parse_space(pos + 1, is_nested);
|
191 |
+
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
|
192 |
+
if (last_sym_start == out_elements.size()) {
|
193 |
+
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
|
194 |
+
}
|
195 |
+
|
196 |
+
// apply transformation to previous symbol (last_sym_start to end) according to
|
197 |
+
// rewrite rules:
|
198 |
+
// S* --> S' ::= S S' |
|
199 |
+
// S+ --> S' ::= S S' | S
|
200 |
+
// S? --> S' ::= S |
|
201 |
+
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
202 |
+
std::vector<llama_grammar_element> sub_rule;
|
203 |
+
// add preceding symbol to generated rule
|
204 |
+
sub_rule.insert(
|
205 |
+
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
|
206 |
+
if (*pos == '*' || *pos == '+') {
|
207 |
+
// cause generated rule to recurse
|
208 |
+
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
209 |
+
}
|
210 |
+
// mark start of alternate def
|
211 |
+
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
212 |
+
if (*pos == '+') {
|
213 |
+
// add preceding symbol as alternate only for '+' (otherwise empty)
|
214 |
+
sub_rule.insert(
|
215 |
+
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
|
216 |
+
}
|
217 |
+
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
|
218 |
+
add_rule(state, sub_rule_id, sub_rule);
|
219 |
+
|
220 |
+
// in original rule, replace previous symbol with reference to generated rule
|
221 |
+
out_elements.resize(last_sym_start);
|
222 |
+
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
223 |
+
|
224 |
+
pos = parse_space(pos + 1, is_nested);
|
225 |
+
} else {
|
226 |
+
break;
|
227 |
+
}
|
228 |
+
}
|
229 |
+
return pos;
|
230 |
+
}
|
231 |
+
|
232 |
+
const char * parse_alternates(
|
233 |
+
parse_state & state,
|
234 |
+
const char * src,
|
235 |
+
const std::string & rule_name,
|
236 |
+
uint32_t rule_id,
|
237 |
+
bool is_nested) {
|
238 |
+
std::vector<llama_grammar_element> rule;
|
239 |
+
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
|
240 |
+
while (*pos == '|') {
|
241 |
+
rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
242 |
+
pos = parse_space(pos + 1, true);
|
243 |
+
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
|
244 |
+
}
|
245 |
+
rule.push_back({LLAMA_GRETYPE_END, 0});
|
246 |
+
add_rule(state, rule_id, rule);
|
247 |
+
return pos;
|
248 |
+
}
|
249 |
+
|
250 |
+
static const char * parse_rule(parse_state & state, const char * src) {
|
251 |
+
const char * name_end = parse_name(src);
|
252 |
+
const char * pos = parse_space(name_end, false);
|
253 |
+
size_t name_len = name_end - src;
|
254 |
+
uint32_t rule_id = get_symbol_id(state, src, name_len);
|
255 |
+
const std::string name(src, name_len);
|
256 |
+
|
257 |
+
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
258 |
+
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
259 |
+
}
|
260 |
+
pos = parse_space(pos + 3, true);
|
261 |
+
|
262 |
+
pos = parse_alternates(state, pos, name, rule_id, false);
|
263 |
+
|
264 |
+
if (*pos == '\r') {
|
265 |
+
pos += pos[1] == '\n' ? 2 : 1;
|
266 |
+
} else if (*pos == '\n') {
|
267 |
+
pos++;
|
268 |
+
} else if (*pos) {
|
269 |
+
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
270 |
+
}
|
271 |
+
return parse_space(pos, true);
|
272 |
+
}
|
273 |
+
|
274 |
+
parse_state parse(const char * src) {
|
275 |
+
try {
|
276 |
+
parse_state state;
|
277 |
+
const char * pos = parse_space(src, true);
|
278 |
+
while (*pos) {
|
279 |
+
pos = parse_rule(state, pos);
|
280 |
+
}
|
281 |
+
return state;
|
282 |
+
} catch (const std::exception & err) {
|
283 |
+
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
284 |
+
return parse_state();
|
285 |
+
}
|
286 |
+
}
|
287 |
+
|
288 |
+
static void print_grammar_char(FILE * file, uint32_t c) {
|
289 |
+
if (0x20 <= c && c <= 0x7f) {
|
290 |
+
fprintf(file, "%c", static_cast<char>(c));
|
291 |
+
} else {
|
292 |
+
// cop out of encoding UTF-8
|
293 |
+
fprintf(file, "<U+%04X>", c);
|
294 |
+
}
|
295 |
+
}
|
296 |
+
|
297 |
+
static bool is_char_element(llama_grammar_element elem) {
|
298 |
+
switch (elem.type) {
|
299 |
+
case LLAMA_GRETYPE_CHAR: return true;
|
300 |
+
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
301 |
+
case LLAMA_GRETYPE_CHAR_ALT: return true;
|
302 |
+
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
|
303 |
+
default: return false;
|
304 |
+
}
|
305 |
+
}
|
306 |
+
|
307 |
+
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
308 |
+
for (auto elem : rule) {
|
309 |
+
switch (elem.type) {
|
310 |
+
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
311 |
+
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
312 |
+
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
313 |
+
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
314 |
+
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
315 |
+
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
316 |
+
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
317 |
+
}
|
318 |
+
switch (elem.type) {
|
319 |
+
case LLAMA_GRETYPE_END:
|
320 |
+
case LLAMA_GRETYPE_ALT:
|
321 |
+
case LLAMA_GRETYPE_RULE_REF:
|
322 |
+
fprintf(file, "(%u) ", elem.value);
|
323 |
+
break;
|
324 |
+
case LLAMA_GRETYPE_CHAR:
|
325 |
+
case LLAMA_GRETYPE_CHAR_NOT:
|
326 |
+
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
327 |
+
case LLAMA_GRETYPE_CHAR_ALT:
|
328 |
+
fprintf(file, "(\"");
|
329 |
+
print_grammar_char(file, elem.value);
|
330 |
+
fprintf(file, "\") ");
|
331 |
+
break;
|
332 |
+
}
|
333 |
+
}
|
334 |
+
fprintf(file, "\n");
|
335 |
+
}
|
336 |
+
|
337 |
+
static void print_rule(
|
338 |
+
FILE * file,
|
339 |
+
uint32_t rule_id,
|
340 |
+
const std::vector<llama_grammar_element> & rule,
|
341 |
+
const std::map<uint32_t, std::string> & symbol_id_names) {
|
342 |
+
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
343 |
+
throw std::runtime_error(
|
344 |
+
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
345 |
+
}
|
346 |
+
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
347 |
+
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
348 |
+
llama_grammar_element elem = rule[i];
|
349 |
+
switch (elem.type) {
|
350 |
+
case LLAMA_GRETYPE_END:
|
351 |
+
throw std::runtime_error(
|
352 |
+
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
353 |
+
std::to_string(i));
|
354 |
+
case LLAMA_GRETYPE_ALT:
|
355 |
+
fprintf(file, "| ");
|
356 |
+
break;
|
357 |
+
case LLAMA_GRETYPE_RULE_REF:
|
358 |
+
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
359 |
+
break;
|
360 |
+
case LLAMA_GRETYPE_CHAR:
|
361 |
+
fprintf(file, "[");
|
362 |
+
print_grammar_char(file, elem.value);
|
363 |
+
break;
|
364 |
+
case LLAMA_GRETYPE_CHAR_NOT:
|
365 |
+
fprintf(file, "[^");
|
366 |
+
print_grammar_char(file, elem.value);
|
367 |
+
break;
|
368 |
+
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
369 |
+
if (i == 0 || !is_char_element(rule[i - 1])) {
|
370 |
+
throw std::runtime_error(
|
371 |
+
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
372 |
+
std::to_string(rule_id) + "," + std::to_string(i));
|
373 |
+
}
|
374 |
+
fprintf(file, "-");
|
375 |
+
print_grammar_char(file, elem.value);
|
376 |
+
break;
|
377 |
+
case LLAMA_GRETYPE_CHAR_ALT:
|
378 |
+
if (i == 0 || !is_char_element(rule[i - 1])) {
|
379 |
+
throw std::runtime_error(
|
380 |
+
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
381 |
+
std::to_string(rule_id) + "," + std::to_string(i));
|
382 |
+
}
|
383 |
+
print_grammar_char(file, elem.value);
|
384 |
+
break;
|
385 |
+
}
|
386 |
+
if (is_char_element(elem)) {
|
387 |
+
switch (rule[i + 1].type) {
|
388 |
+
case LLAMA_GRETYPE_CHAR_ALT:
|
389 |
+
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
390 |
+
break;
|
391 |
+
default:
|
392 |
+
fprintf(file, "] ");
|
393 |
+
}
|
394 |
+
}
|
395 |
+
}
|
396 |
+
fprintf(file, "\n");
|
397 |
+
}
|
398 |
+
|
399 |
+
void print_grammar(FILE * file, const parse_state & state) {
|
400 |
+
try {
|
401 |
+
std::map<uint32_t, std::string> symbol_id_names;
|
402 |
+
for (auto kv : state.symbol_ids) {
|
403 |
+
symbol_id_names[kv.second] = kv.first;
|
404 |
+
}
|
405 |
+
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
406 |
+
// fprintf(file, "%zu: ", i);
|
407 |
+
// print_rule_binary(file, state.rules[i]);
|
408 |
+
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
409 |
+
// fprintf(file, "\n");
|
410 |
+
}
|
411 |
+
} catch (const std::exception & err) {
|
412 |
+
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
413 |
+
}
|
414 |
+
}
|
415 |
+
|
416 |
+
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
417 |
+
std::vector<const llama_grammar_element *> ret;
|
418 |
+
ret.reserve(rules.size());
|
419 |
+
for (const auto & rule : rules) {
|
420 |
+
ret.push_back(rule.data());
|
421 |
+
}
|
422 |
+
return ret;
|
423 |
+
}
|
424 |
+
}
|
common/grammar-parser.h
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Implements a parser for an extended Backus-Naur form (BNF), producing the
|
2 |
+
// binary context-free grammar format specified by llama.h. Supports character
|
3 |
+
// ranges, grouping, and repetition operators. As an example, a grammar for
|
4 |
+
// arithmetic might look like:
|
5 |
+
//
|
6 |
+
// root ::= expr
|
7 |
+
// expr ::= term ([-+*/] term)*
|
8 |
+
// term ::= num | "(" space expr ")" space
|
9 |
+
// num ::= [0-9]+ space
|
10 |
+
// space ::= [ \t\n]*
|
11 |
+
|
12 |
+
#pragma once
|
13 |
+
#include "llama.h"
|
14 |
+
#include <vector>
|
15 |
+
#include <map>
|
16 |
+
#include <cstdint>
|
17 |
+
#include <string>
|
18 |
+
|
19 |
+
namespace grammar_parser {
|
20 |
+
struct parse_state {
|
21 |
+
std::map<std::string, uint32_t> symbol_ids;
|
22 |
+
std::vector<std::vector<llama_grammar_element>> rules;
|
23 |
+
|
24 |
+
std::vector<const llama_grammar_element *> c_rules();
|
25 |
+
};
|
26 |
+
|
27 |
+
parse_state parse(const char * src);
|
28 |
+
void print_grammar(FILE * file, const parse_state & state);
|
29 |
+
}
|
common/log.h
ADDED
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
#include <chrono>
|
4 |
+
#include <cstring>
|
5 |
+
#include <sstream>
|
6 |
+
#include <iostream>
|
7 |
+
#include <thread>
|
8 |
+
#include <vector>
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cinttypes>
|
11 |
+
|
12 |
+
// --------------------------------
|
13 |
+
//
|
14 |
+
// Basic usage:
|
15 |
+
//
|
16 |
+
// --------
|
17 |
+
//
|
18 |
+
// The LOG() and LOG_TEE() macros are ready to go by default
|
19 |
+
// they do not require any initialization.
|
20 |
+
//
|
21 |
+
// LOGLN() and LOG_TEELN() are variants which automatically
|
22 |
+
// include \n character at the end of the log string.
|
23 |
+
//
|
24 |
+
// LOG() behaves exactly like printf, by default writing to a logfile.
|
25 |
+
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
|
26 |
+
//
|
27 |
+
// Default logfile is named
|
28 |
+
// "llama.<threadID>.log"
|
29 |
+
// Default LOG_TEE() secondary output target is
|
30 |
+
// stderr
|
31 |
+
//
|
32 |
+
// Logs can be dynamically disabled or enabled using functions:
|
33 |
+
// log_disable()
|
34 |
+
// and
|
35 |
+
// log_enable()
|
36 |
+
//
|
37 |
+
// A log target can be changed with:
|
38 |
+
// log_set_target( string )
|
39 |
+
// creating and opening, or re-opening a file by string filename
|
40 |
+
// or
|
41 |
+
// log_set_target( FILE* )
|
42 |
+
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
43 |
+
//
|
44 |
+
// --------
|
45 |
+
//
|
46 |
+
// End of Basic usage.
|
47 |
+
//
|
48 |
+
// --------------------------------
|
49 |
+
|
50 |
+
// Specifies a log target.
|
51 |
+
// default uses log_handler() with "llama.log" log file
|
52 |
+
// this can be changed, by defining LOG_TARGET
|
53 |
+
// like so:
|
54 |
+
//
|
55 |
+
// #define LOG_TARGET (a valid FILE*)
|
56 |
+
// #include "log.h"
|
57 |
+
//
|
58 |
+
// or it can be simply redirected to stdout or stderr
|
59 |
+
// like so:
|
60 |
+
//
|
61 |
+
// #define LOG_TARGET stderr
|
62 |
+
// #include "log.h"
|
63 |
+
//
|
64 |
+
// The log target can also be redirected to a diffrent function
|
65 |
+
// like so:
|
66 |
+
//
|
67 |
+
// #define LOG_TARGET log_handler_diffrent()
|
68 |
+
// #include "log.h"
|
69 |
+
//
|
70 |
+
// FILE* log_handler_diffrent()
|
71 |
+
// {
|
72 |
+
// return stderr;
|
73 |
+
// }
|
74 |
+
//
|
75 |
+
// or:
|
76 |
+
//
|
77 |
+
// #define LOG_TARGET log_handler_another_one("somelog.log")
|
78 |
+
// #include "log.h"
|
79 |
+
//
|
80 |
+
// FILE* log_handler_another_one(char*filename)
|
81 |
+
// {
|
82 |
+
// static FILE* logfile = nullptr;
|
83 |
+
// (...)
|
84 |
+
// if( !logfile )
|
85 |
+
// {
|
86 |
+
// fopen(...)
|
87 |
+
// }
|
88 |
+
// (...)
|
89 |
+
// return logfile
|
90 |
+
// }
|
91 |
+
//
|
92 |
+
#ifndef LOG_TARGET
|
93 |
+
#define LOG_TARGET log_handler()
|
94 |
+
#endif
|
95 |
+
|
96 |
+
#ifndef LOG_TEE_TARGET
|
97 |
+
#define LOG_TEE_TARGET stderr
|
98 |
+
#endif
|
99 |
+
|
100 |
+
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
101 |
+
inline std::string log_get_pid()
|
102 |
+
{
|
103 |
+
static std::string pid;
|
104 |
+
if (pid.empty())
|
105 |
+
{
|
106 |
+
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
107 |
+
// it's not the same as "pid" but is unique enough to solve multiple instances
|
108 |
+
// trying to write to the same log.
|
109 |
+
std::stringstream ss;
|
110 |
+
ss << std::this_thread::get_id();
|
111 |
+
pid = ss.str();
|
112 |
+
}
|
113 |
+
|
114 |
+
return pid;
|
115 |
+
}
|
116 |
+
|
117 |
+
// Utility function for generating log file names with unique id based on thread id.
|
118 |
+
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
119 |
+
// where the number is a runtime id of the current thread.
|
120 |
+
|
121 |
+
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
|
122 |
+
|
123 |
+
// INTERNAL, DO NOT USE
|
124 |
+
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
|
125 |
+
{
|
126 |
+
std::stringstream buf;
|
127 |
+
|
128 |
+
buf << log_file_basename;
|
129 |
+
buf << ".";
|
130 |
+
buf << log_get_pid();
|
131 |
+
buf << ".";
|
132 |
+
buf << log_file_extension;
|
133 |
+
|
134 |
+
return buf.str();
|
135 |
+
}
|
136 |
+
|
137 |
+
#ifndef LOG_DEFAULT_FILE_NAME
|
138 |
+
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
|
139 |
+
#endif
|
140 |
+
|
141 |
+
// Utility for turning #define values into string literals
|
142 |
+
// so we can have a define for stderr and
|
143 |
+
// we can print "stderr" instead of literal stderr, etc.
|
144 |
+
#define LOG_STRINGIZE1(s) #s
|
145 |
+
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
|
146 |
+
|
147 |
+
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
|
148 |
+
|
149 |
+
// Allows disabling timestamps.
|
150 |
+
// in order to disable, define LOG_NO_TIMESTAMPS
|
151 |
+
// like so:
|
152 |
+
//
|
153 |
+
// #define LOG_NO_TIMESTAMPS
|
154 |
+
// #include "log.h"
|
155 |
+
//
|
156 |
+
#ifndef LOG_NO_TIMESTAMPS
|
157 |
+
#ifndef _MSC_VER
|
158 |
+
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
159 |
+
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
160 |
+
#else
|
161 |
+
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
162 |
+
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
163 |
+
#endif
|
164 |
+
#else
|
165 |
+
#define LOG_TIMESTAMP_FMT "%s"
|
166 |
+
#define LOG_TIMESTAMP_VAL ,""
|
167 |
+
#endif
|
168 |
+
|
169 |
+
#ifdef LOG_TEE_TIMESTAMPS
|
170 |
+
#ifndef _MSC_VER
|
171 |
+
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
172 |
+
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
173 |
+
#else
|
174 |
+
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
175 |
+
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
176 |
+
#endif
|
177 |
+
#else
|
178 |
+
#define LOG_TEE_TIMESTAMP_FMT "%s"
|
179 |
+
#define LOG_TEE_TIMESTAMP_VAL ,""
|
180 |
+
#endif
|
181 |
+
|
182 |
+
// Allows disabling file/line/function prefix
|
183 |
+
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
|
184 |
+
// like so:
|
185 |
+
//
|
186 |
+
// #define LOG_NO_FILE_LINE_FUNCTION
|
187 |
+
// #include "log.h"
|
188 |
+
//
|
189 |
+
#ifndef LOG_NO_FILE_LINE_FUNCTION
|
190 |
+
#ifndef _MSC_VER
|
191 |
+
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
|
192 |
+
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
193 |
+
#else
|
194 |
+
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
195 |
+
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
196 |
+
#endif
|
197 |
+
#else
|
198 |
+
#define LOG_FLF_FMT "%s"
|
199 |
+
#define LOG_FLF_VAL ,""
|
200 |
+
#endif
|
201 |
+
|
202 |
+
#ifdef LOG_TEE_FILE_LINE_FUNCTION
|
203 |
+
#ifndef _MSC_VER
|
204 |
+
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
|
205 |
+
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
206 |
+
#else
|
207 |
+
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
208 |
+
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
209 |
+
#endif
|
210 |
+
#else
|
211 |
+
#define LOG_TEE_FLF_FMT "%s"
|
212 |
+
#define LOG_TEE_FLF_VAL ,""
|
213 |
+
#endif
|
214 |
+
|
215 |
+
// Utility for synchronizing log configuration state
|
216 |
+
// since std::optional was introduced only in c++17
|
217 |
+
enum LogTriState
|
218 |
+
{
|
219 |
+
LogTriStateSame,
|
220 |
+
LogTriStateFalse,
|
221 |
+
LogTriStateTrue
|
222 |
+
};
|
223 |
+
|
224 |
+
// INTERNAL, DO NOT USE
|
225 |
+
// USE LOG() INSTEAD
|
226 |
+
//
|
227 |
+
#ifndef _MSC_VER
|
228 |
+
#define LOG_IMPL(str, ...) \
|
229 |
+
{ \
|
230 |
+
if (LOG_TARGET != nullptr) \
|
231 |
+
{ \
|
232 |
+
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
233 |
+
fflush(LOG_TARGET); \
|
234 |
+
} \
|
235 |
+
}
|
236 |
+
#else
|
237 |
+
#define LOG_IMPL(str, ...) \
|
238 |
+
{ \
|
239 |
+
if (LOG_TARGET != nullptr) \
|
240 |
+
{ \
|
241 |
+
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
242 |
+
fflush(LOG_TARGET); \
|
243 |
+
} \
|
244 |
+
}
|
245 |
+
#endif
|
246 |
+
|
247 |
+
// INTERNAL, DO NOT USE
|
248 |
+
// USE LOG_TEE() INSTEAD
|
249 |
+
//
|
250 |
+
#ifndef _MSC_VER
|
251 |
+
#define LOG_TEE_IMPL(str, ...) \
|
252 |
+
{ \
|
253 |
+
if (LOG_TARGET != nullptr) \
|
254 |
+
{ \
|
255 |
+
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
256 |
+
fflush(LOG_TARGET); \
|
257 |
+
} \
|
258 |
+
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
259 |
+
{ \
|
260 |
+
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
261 |
+
fflush(LOG_TEE_TARGET); \
|
262 |
+
} \
|
263 |
+
}
|
264 |
+
#else
|
265 |
+
#define LOG_TEE_IMPL(str, ...) \
|
266 |
+
{ \
|
267 |
+
if (LOG_TARGET != nullptr) \
|
268 |
+
{ \
|
269 |
+
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
270 |
+
fflush(LOG_TARGET); \
|
271 |
+
} \
|
272 |
+
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
273 |
+
{ \
|
274 |
+
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
275 |
+
fflush(LOG_TEE_TARGET); \
|
276 |
+
} \
|
277 |
+
}
|
278 |
+
#endif
|
279 |
+
|
280 |
+
// The '\0' as a last argument, is a trick to bypass the silly
|
281 |
+
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
|
282 |
+
// so we can have a single macro which can be called just like printf.
|
283 |
+
|
284 |
+
// Main LOG macro.
|
285 |
+
// behaves like printf, and supports arguments the exact same way.
|
286 |
+
//
|
287 |
+
#ifndef _MSC_VER
|
288 |
+
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
289 |
+
#else
|
290 |
+
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
|
291 |
+
#endif
|
292 |
+
|
293 |
+
// Main TEE macro.
|
294 |
+
// does the same as LOG
|
295 |
+
// and
|
296 |
+
// simultaneously writes stderr.
|
297 |
+
//
|
298 |
+
// Secondary target can be changed just like LOG_TARGET
|
299 |
+
// by defining LOG_TEE_TARGET
|
300 |
+
//
|
301 |
+
#ifndef _MSC_VER
|
302 |
+
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
303 |
+
#else
|
304 |
+
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
|
305 |
+
#endif
|
306 |
+
|
307 |
+
// LOG macro variants with auto endline.
|
308 |
+
#ifndef _MSC_VER
|
309 |
+
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
310 |
+
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
311 |
+
#else
|
312 |
+
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
313 |
+
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
314 |
+
#endif
|
315 |
+
|
316 |
+
// INTERNAL, DO NOT USE
|
317 |
+
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
|
318 |
+
{
|
319 |
+
static bool _initialized{false};
|
320 |
+
static bool _disabled{(filename.empty() && target == nullptr)};
|
321 |
+
static std::string log_current_filename{filename};
|
322 |
+
static FILE *log_current_target{target};
|
323 |
+
static FILE *logfile = nullptr;
|
324 |
+
|
325 |
+
if (change)
|
326 |
+
{
|
327 |
+
if (disable == LogTriStateTrue)
|
328 |
+
{
|
329 |
+
// Disable primary target
|
330 |
+
_disabled = true;
|
331 |
+
}
|
332 |
+
// If previously disabled, only enable, and keep previous target
|
333 |
+
else if (disable == LogTriStateFalse)
|
334 |
+
{
|
335 |
+
_disabled = false;
|
336 |
+
}
|
337 |
+
// Otherwise, process the arguments
|
338 |
+
else if (log_current_filename != filename || log_current_target != target)
|
339 |
+
{
|
340 |
+
_initialized = false;
|
341 |
+
}
|
342 |
+
}
|
343 |
+
|
344 |
+
if (_disabled)
|
345 |
+
{
|
346 |
+
// Log is disabled
|
347 |
+
return nullptr;
|
348 |
+
}
|
349 |
+
|
350 |
+
if (_initialized)
|
351 |
+
{
|
352 |
+
// with fallback in case something went wrong
|
353 |
+
return logfile ? logfile : stderr;
|
354 |
+
}
|
355 |
+
|
356 |
+
// do the (re)initialization
|
357 |
+
if (target != nullptr)
|
358 |
+
{
|
359 |
+
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
360 |
+
{
|
361 |
+
fclose(logfile);
|
362 |
+
}
|
363 |
+
|
364 |
+
log_current_filename = LOG_DEFAULT_FILE_NAME;
|
365 |
+
log_current_target = target;
|
366 |
+
|
367 |
+
logfile = target;
|
368 |
+
}
|
369 |
+
else
|
370 |
+
{
|
371 |
+
if (log_current_filename != filename)
|
372 |
+
{
|
373 |
+
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
374 |
+
{
|
375 |
+
fclose(logfile);
|
376 |
+
}
|
377 |
+
}
|
378 |
+
|
379 |
+
logfile = fopen(filename.c_str(), "w");
|
380 |
+
}
|
381 |
+
|
382 |
+
if (!logfile)
|
383 |
+
{
|
384 |
+
// Verify whether the file was opened, otherwise fall back to stderr
|
385 |
+
logfile = stderr;
|
386 |
+
|
387 |
+
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
|
388 |
+
fflush(stderr);
|
389 |
+
|
390 |
+
// At this point we let the init flag be set to true below, and let the target fall back to stderr
|
391 |
+
// otherwise we would repeatedly fopen() a file that already failed to open
|
392 |
+
}
|
393 |
+
|
394 |
+
_initialized = true;
|
395 |
+
|
396 |
+
return logfile ? logfile : stderr;
|
397 |
+
}
|
398 |
+
|
399 |
+
// INTERNAL, DO NOT USE
|
400 |
+
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
|
401 |
+
{
|
402 |
+
return log_handler1_impl(change, disable, filename, target);
|
403 |
+
}
|
404 |
+
|
405 |
+
// Disables logs entirely at runtime.
|
406 |
+
// Makes LOG() and LOG_TEE() produce no output,
|
407 |
+
// until enabled again.
|
408 |
+
#define log_disable() log_disable_impl()
|
409 |
+
|
410 |
+
// INTERNAL, DO NOT USE
|
411 |
+
inline FILE *log_disable_impl()
|
412 |
+
{
|
413 |
+
return log_handler1_impl(true, LogTriStateTrue);
|
414 |
+
}
|
415 |
+
|
416 |
+
// Enables logs at runtime.
|
417 |
+
#define log_enable() log_enable_impl()
|
418 |
+
|
419 |
+
// INTERNAL, DO NOT USE
|
420 |
+
inline FILE *log_enable_impl()
|
421 |
+
{
|
422 |
+
return log_handler1_impl(true, LogTriStateFalse);
|
423 |
+
}
|
424 |
+
|
425 |
+
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
|
426 |
+
#define log_set_target(target) log_set_target_impl(target)
|
427 |
+
|
428 |
+
// INTERNAL, DO NOT USE
|
429 |
+
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
|
430 |
+
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
|
431 |
+
|
432 |
+
// INTERNAL, DO NOT USE
|
433 |
+
inline FILE *log_handler() { return log_handler1_impl(); }
|
434 |
+
|
435 |
+
inline void log_test()
|
436 |
+
{
|
437 |
+
log_disable();
|
438 |
+
LOG("01 Hello World to nobody, because logs are disabled!\n")
|
439 |
+
log_enable();
|
440 |
+
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
|
441 |
+
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
|
442 |
+
log_set_target(stderr);
|
443 |
+
LOG("04 Hello World to stderr!\n")
|
444 |
+
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
|
445 |
+
log_set_target(LOG_DEFAULT_FILE_NAME);
|
446 |
+
LOG("06 Hello World to default log file!\n")
|
447 |
+
log_set_target(stdout);
|
448 |
+
LOG("07 Hello World to stdout!\n")
|
449 |
+
log_set_target(LOG_DEFAULT_FILE_NAME);
|
450 |
+
LOG("08 Hello World to default log file again!\n")
|
451 |
+
log_disable();
|
452 |
+
LOG("09 Hello World _1_ into the void!\n")
|
453 |
+
log_enable();
|
454 |
+
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
|
455 |
+
log_disable();
|
456 |
+
log_set_target("llama.anotherlog.log");
|
457 |
+
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
|
458 |
+
log_enable();
|
459 |
+
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
|
460 |
+
log_set_target("llama.yetanotherlog.log");
|
461 |
+
LOG("13 Hello World this time in yet new file?\n")
|
462 |
+
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
463 |
+
LOG("14 Hello World in log with generated filename!\n")
|
464 |
+
#ifdef _MSC_VER
|
465 |
+
LOG_TEE("15 Hello msvc TEE without arguments\n")
|
466 |
+
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
|
467 |
+
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
|
468 |
+
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
|
469 |
+
LOG("19 Hello msvc LOG without arguments\n")
|
470 |
+
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
|
471 |
+
LOGLN("21 Hello msvc LOGLN without arguments\n")
|
472 |
+
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
|
473 |
+
#endif
|
474 |
+
}
|
475 |
+
|
476 |
+
inline bool log_param_single_parse(const std::string & param)
|
477 |
+
{
|
478 |
+
if ( param == "--log-test")
|
479 |
+
{
|
480 |
+
log_test();
|
481 |
+
return true;
|
482 |
+
}
|
483 |
+
|
484 |
+
if ( param == "--log-disable")
|
485 |
+
{
|
486 |
+
log_disable();
|
487 |
+
return true;
|
488 |
+
}
|
489 |
+
|
490 |
+
if ( param == "--log-enable")
|
491 |
+
{
|
492 |
+
log_enable();
|
493 |
+
return true;
|
494 |
+
}
|
495 |
+
|
496 |
+
return false;
|
497 |
+
}
|
498 |
+
|
499 |
+
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
|
500 |
+
{
|
501 |
+
if ( param == "--log-file")
|
502 |
+
{
|
503 |
+
if (!check_but_dont_parse)
|
504 |
+
{
|
505 |
+
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
|
506 |
+
}
|
507 |
+
|
508 |
+
return true;
|
509 |
+
}
|
510 |
+
|
511 |
+
return false;
|
512 |
+
}
|
513 |
+
|
514 |
+
inline void log_print_usage()
|
515 |
+
{
|
516 |
+
printf("log options:\n");
|
517 |
+
/* format
|
518 |
+
printf(" -h, --help show this help message and exit\n");*/
|
519 |
+
/* spacing
|
520 |
+
printf("__-param----------------Description\n");*/
|
521 |
+
printf(" --log-test Run simple logging test\n");
|
522 |
+
printf(" --log-disable Disable trace logs\n");
|
523 |
+
printf(" --log-enable Enable trace logs\n");
|
524 |
+
printf(" --log-file Specify a log filename (without extension)\n");
|
525 |
+
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
|
526 |
+
}
|
527 |
+
|
528 |
+
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
|
529 |
+
|
530 |
+
// INTERNAL, DO NOT USE
|
531 |
+
inline void log_dump_cmdline_impl(int argc, char **argv)
|
532 |
+
{
|
533 |
+
std::stringstream buf;
|
534 |
+
for (int i = 0; i < argc; ++i)
|
535 |
+
{
|
536 |
+
if (std::string(argv[i]).find(' ') != std::string::npos)
|
537 |
+
{
|
538 |
+
buf << " \"" << argv[i] <<"\"";
|
539 |
+
}
|
540 |
+
else
|
541 |
+
{
|
542 |
+
buf << " " << argv[i];
|
543 |
+
}
|
544 |
+
}
|
545 |
+
LOGLN("Cmd:%s", buf.str().c_str())
|
546 |
+
}
|
547 |
+
|
548 |
+
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
549 |
+
|
550 |
+
inline std::string log_var_to_string_impl(bool var)
|
551 |
+
{
|
552 |
+
return var ? "true" : "false";
|
553 |
+
}
|
554 |
+
|
555 |
+
inline std::string log_var_to_string_impl(std::string var)
|
556 |
+
{
|
557 |
+
return var;
|
558 |
+
}
|
559 |
+
|
560 |
+
inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
561 |
+
{
|
562 |
+
std::stringstream buf;
|
563 |
+
buf << "[ ";
|
564 |
+
bool first = true;
|
565 |
+
for (auto e : var)
|
566 |
+
{
|
567 |
+
if (first)
|
568 |
+
{
|
569 |
+
first = false;
|
570 |
+
}
|
571 |
+
else
|
572 |
+
{
|
573 |
+
buf << ", ";
|
574 |
+
}
|
575 |
+
buf << std::to_string(e);
|
576 |
+
}
|
577 |
+
buf << " ]";
|
578 |
+
|
579 |
+
return buf.str();
|
580 |
+
}
|
581 |
+
|
582 |
+
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
|
583 |
+
[&tokens, &ctx]() \
|
584 |
+
{ \
|
585 |
+
std::stringstream buf; \
|
586 |
+
buf << "[ "; \
|
587 |
+
\
|
588 |
+
bool first = true; \
|
589 |
+
for (const auto &token : tokens) \
|
590 |
+
{ \
|
591 |
+
if (!first) \
|
592 |
+
buf << ", "; \
|
593 |
+
else \
|
594 |
+
first = false; \
|
595 |
+
\
|
596 |
+
auto detokenized = llama_token_to_piece(ctx, token); \
|
597 |
+
\
|
598 |
+
detokenized.erase( \
|
599 |
+
std::remove_if( \
|
600 |
+
detokenized.begin(), \
|
601 |
+
detokenized.end(), \
|
602 |
+
[](const unsigned char c) { return !std::isprint(c); }), \
|
603 |
+
detokenized.end()); \
|
604 |
+
\
|
605 |
+
buf \
|
606 |
+
<< "'" << detokenized << "'" \
|
607 |
+
<< ":" << std::to_string(token); \
|
608 |
+
} \
|
609 |
+
buf << " ]"; \
|
610 |
+
\
|
611 |
+
return buf.str(); \
|
612 |
+
}() \
|
613 |
+
.c_str()
|
614 |
+
|
615 |
+
#ifdef LOG_DISABLE_LOGS
|
616 |
+
|
617 |
+
#undef LOG
|
618 |
+
#define LOG(...) // dummy stub
|
619 |
+
#undef LOGLN
|
620 |
+
#define LOGLN(...) // dummy stub
|
621 |
+
|
622 |
+
#undef LOG_TEE
|
623 |
+
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
624 |
+
|
625 |
+
#undef LOG_TEELN
|
626 |
+
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
627 |
+
|
628 |
+
#undef LOG_DISABLE
|
629 |
+
#define LOG_DISABLE() // dummy stub
|
630 |
+
|
631 |
+
#undef LOG_ENABLE
|
632 |
+
#define LOG_ENABLE() // dummy stub
|
633 |
+
|
634 |
+
#undef LOG_ENABLE
|
635 |
+
#define LOG_ENABLE() // dummy stub
|
636 |
+
|
637 |
+
#undef LOG_SET_TARGET
|
638 |
+
#define LOG_SET_TARGET(...) // dummy stub
|
639 |
+
|
640 |
+
#undef LOG_DUMP_CMDLINE
|
641 |
+
#define LOG_DUMP_CMDLINE(...) // dummy stub
|
642 |
+
|
643 |
+
#endif // LOG_DISABLE_LOGS
|
convert-baichuan-hf-to-gguf.py
ADDED
@@ -0,0 +1,304 @@
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# HF baichuan --> gguf conversion
|
3 |
+
|
4 |
+
from __future__ import annotations
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import struct
|
10 |
+
import sys
|
11 |
+
from pathlib import Path
|
12 |
+
from typing import TYPE_CHECKING, Any
|
13 |
+
import itertools
|
14 |
+
import gguf
|
15 |
+
import numpy as np
|
16 |
+
import torch
|
17 |
+
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
18 |
+
|
19 |
+
|
20 |
+
if TYPE_CHECKING:
|
21 |
+
from typing import TypeAlias
|
22 |
+
|
23 |
+
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
24 |
+
|
25 |
+
# reverse HF permute back to original pth layout
|
26 |
+
|
27 |
+
|
28 |
+
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
29 |
+
if n_kv_head is not None and n_head != n_kv_head:
|
30 |
+
n_head //= n_kv_head
|
31 |
+
|
32 |
+
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
33 |
+
.swapaxes(1, 2)
|
34 |
+
.reshape(weights.shape))
|
35 |
+
|
36 |
+
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
37 |
+
r = weights.shape[0] // 3
|
38 |
+
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
39 |
+
|
40 |
+
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
41 |
+
r = weights.shape[0] // 3
|
42 |
+
return weights[r * n_part : r * n_part + r, ...]
|
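For illustration, a small standalone sketch (not part of the script) of the row reordering that reverse_hf_permute applies; the toy 4x2 matrix and single head are assumptions made up for the example.
import numpy as np
w = np.arange(8).reshape(4, 2)      # rows r0..r3 of a toy attention weight
out = (w.reshape(1, 2, 2, 2)        # same reshape/swap as reverse_hf_permute with n_head=1
        .swapaxes(1, 2)
        .reshape(w.shape))
print(out[:, 0])                    # [0 4 2 6] -> row order becomes r0, r2, r1, r3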
43 |
+
|
44 |
+
def count_model_parts(dir_model: str) -> int:
|
45 |
+
num_parts = 0
|
46 |
+
|
47 |
+
for filename in os.listdir(dir_model):
|
48 |
+
if filename.startswith("pytorch_model-"):
|
49 |
+
num_parts += 1
|
50 |
+
|
51 |
+
if num_parts > 0:
|
52 |
+
print("gguf: found " + str(num_parts) + " model parts")
|
53 |
+
|
54 |
+
return num_parts
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
def parse_args() -> argparse.Namespace:
|
59 |
+
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
60 |
+
parser.add_argument(
|
61 |
+
"--vocab-only", action="store_true",
|
62 |
+
help="extract only the vocab",
|
63 |
+
)
|
64 |
+
parser.add_argument(
|
65 |
+
"--outfile", type=Path,
|
66 |
+
help="path to write to; default: based on input",
|
67 |
+
)
|
68 |
+
parser.add_argument(
|
69 |
+
"model", type=Path,
|
70 |
+
help="directory containing model file, or model file itself (*.bin)",
|
71 |
+
)
|
72 |
+
parser.add_argument(
|
73 |
+
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
74 |
+
help="output format - use 0 for float32, 1 for float16",
|
75 |
+
)
|
76 |
+
return parser.parse_args()
|
77 |
+
|
78 |
+
args = parse_args()
|
79 |
+
|
80 |
+
dir_model = args.model
|
81 |
+
ftype = args.ftype
|
82 |
+
if not dir_model.is_dir():
|
83 |
+
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
84 |
+
sys.exit(1)
|
85 |
+
|
86 |
+
# possible tensor data types
|
87 |
+
# ftype == 0 -> float32
|
88 |
+
# ftype == 1 -> float16
|
89 |
+
|
90 |
+
# map from ftype to string
|
91 |
+
ftype_str = ["f32", "f16"]
|
92 |
+
|
93 |
+
if args.outfile is not None:
|
94 |
+
fname_out = args.outfile
|
95 |
+
else:
|
96 |
+
# output in the same directory as the model by default
|
97 |
+
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
98 |
+
|
99 |
+
print("gguf: loading model "+dir_model.name)
|
100 |
+
|
101 |
+
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
102 |
+
hparams = json.load(f)
|
103 |
+
print("hello print: ",hparams["architectures"][0])
|
104 |
+
if hparams["architectures"][0] != "BaichuanForCausalLM":
|
105 |
+
print("Model architecture not supported: " + hparams["architectures"][0])
|
106 |
+
|
107 |
+
sys.exit()
|
108 |
+
|
109 |
+
# get number of model parts
|
110 |
+
num_parts = count_model_parts(dir_model)
|
111 |
+
print(f"num_parts:{num_parts}\n")
|
112 |
+
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
113 |
+
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
114 |
+
|
115 |
+
print("gguf: get model metadata")
|
116 |
+
|
117 |
+
block_count = hparams["num_hidden_layers"]
|
118 |
+
head_count = hparams["num_attention_heads"]
|
119 |
+
|
120 |
+
if "num_key_value_heads" in hparams:
|
121 |
+
head_count_kv = hparams["num_key_value_heads"]
|
122 |
+
else:
|
123 |
+
head_count_kv = head_count
|
124 |
+
|
125 |
+
if "_name_or_path" in hparams:
|
126 |
+
hf_repo = hparams["_name_or_path"]
|
127 |
+
else:
|
128 |
+
hf_repo = ""
|
129 |
+
|
130 |
+
if "max_sequence_length" in hparams:
|
131 |
+
ctx_length = hparams["max_sequence_length"]
|
132 |
+
elif "max_position_embeddings" in hparams:
|
133 |
+
ctx_length = hparams["max_position_embeddings"]
|
134 |
+
elif "model_max_length" in hparams:
|
135 |
+
ctx_length = hparams["model_max_length"]
|
136 |
+
else:
|
137 |
+
print("gguf: can not find ctx length parameter.")
|
138 |
+
|
139 |
+
sys.exit()
|
140 |
+
|
141 |
+
|
142 |
+
gguf_writer.add_name(dir_model.name)
|
143 |
+
gguf_writer.add_source_hf_repo(hf_repo)
|
144 |
+
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
145 |
+
gguf_writer.add_context_length(ctx_length)
|
146 |
+
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
147 |
+
gguf_writer.add_block_count(block_count)
|
148 |
+
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
149 |
+
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
150 |
+
gguf_writer.add_head_count(head_count)
|
151 |
+
gguf_writer.add_head_count_kv(head_count_kv)
|
152 |
+
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
153 |
+
|
154 |
+
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
155 |
+
if "type" in hparams["rope_scaling"]:
|
156 |
+
if hparams["rope_scaling"]["type"] == "linear":
|
157 |
+
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
158 |
+
|
159 |
+
|
160 |
+
# TOKENIZATION
|
161 |
+
|
162 |
+
print("gguf: get tokenizer metadata")
|
163 |
+
|
164 |
+
tokens: list[bytes] = []
|
165 |
+
scores: list[float] = []
|
166 |
+
toktypes: list[int] = []
|
167 |
+
|
168 |
+
tokenizer_model_file = dir_model / 'tokenizer.model'
|
169 |
+
if not tokenizer_model_file.is_file():
|
170 |
+
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
171 |
+
sys.exit(1)
|
172 |
+
|
173 |
+
# vocab type sentencepiece
|
174 |
+
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
175 |
+
|
176 |
+
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
177 |
+
|
178 |
+
for i in range(tokenizer.vocab_size()):
|
179 |
+
text: bytes
|
180 |
+
score: float
|
181 |
+
|
182 |
+
piece = tokenizer.id_to_piece(i)
|
183 |
+
text = piece.encode("utf-8")
|
184 |
+
score = tokenizer.get_score(i)
|
185 |
+
|
186 |
+
toktype = 1 # default to normal token type
|
187 |
+
if tokenizer.is_unknown(i):
|
188 |
+
toktype = 2
|
189 |
+
if tokenizer.is_control(i):
|
190 |
+
toktype = 3
|
191 |
+
|
192 |
+
# toktype = 4 is user-defined = tokens from added_tokens.json
|
193 |
+
|
194 |
+
if tokenizer.is_unused(i):
|
195 |
+
toktype = 5
|
196 |
+
if tokenizer.is_byte(i):
|
197 |
+
toktype = 6
|
198 |
+
|
199 |
+
tokens.append(text)
|
200 |
+
scores.append(score)
|
201 |
+
toktypes.append(toktype)
|
202 |
+
|
203 |
+
added_tokens_file = dir_model / 'added_tokens.json'
|
204 |
+
if added_tokens_file.is_file():
|
205 |
+
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
206 |
+
addtokens_json = json.load(f)
|
207 |
+
|
208 |
+
print("gguf: get added tokens")
|
209 |
+
|
210 |
+
for key in addtokens_json:
|
211 |
+
tokens.append( key.encode("utf-8") )
|
212 |
+
scores.append(-1000.0)
|
213 |
+
toktypes.append(4) # user-defined token type
|
214 |
+
|
215 |
+
|
216 |
+
gguf_writer.add_tokenizer_model("llama")
|
217 |
+
gguf_writer.add_token_list(tokens)
|
218 |
+
gguf_writer.add_token_scores(scores)
|
219 |
+
gguf_writer.add_token_types(toktypes)
|
220 |
+
|
221 |
+
special_vocab = gguf.SpecialVocab(dir_model)
|
222 |
+
special_vocab.add_to_gguf(gguf_writer)
|
223 |
+
|
224 |
+
# TENSORS
|
225 |
+
|
226 |
+
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
227 |
+
|
228 |
+
# tensor info
|
229 |
+
print("gguf: get tensor metadata")
|
230 |
+
|
231 |
+
if num_parts == 0:
|
232 |
+
part_names = iter(("pytorch_model.bin",))
|
233 |
+
else:
|
234 |
+
part_names = (
|
235 |
+
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
236 |
+
)
|
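As a quick illustration of the shard names this generator yields, assuming a hypothetical 2-part checkpoint:
num_parts = 2
print([f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)])
# ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']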
237 |
+
|
238 |
+
|
239 |
+
for part_name in part_names:
|
240 |
+
if args.vocab_only:
|
241 |
+
break
|
242 |
+
print("gguf: loading model part '" + part_name + "'")
|
243 |
+
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
244 |
+
|
245 |
+
tmp=model_part
|
246 |
+
for i in range(block_count):
|
247 |
+
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
248 |
+
print(f"Unpacking and permuting layer {i}")
|
249 |
+
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
250 |
+
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
251 |
+
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
252 |
+
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
253 |
+
|
254 |
+
for name in model_part.keys():
|
255 |
+
data = model_part[name]
|
256 |
+
# we don't need these
|
257 |
+
if name.endswith(".rotary_emb.inv_freq"):
|
258 |
+
continue
|
259 |
+
|
260 |
+
old_dtype = data.dtype
|
261 |
+
|
262 |
+
# convert any unsupported data types to float32
|
263 |
+
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
264 |
+
data = data.to(torch.float32)
|
265 |
+
|
266 |
+
data = data.squeeze().numpy()
|
267 |
+
|
268 |
+
# map tensor names
|
269 |
+
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
270 |
+
if new_name is None:
|
271 |
+
print("Can not map tensor '" + name + "'")
|
272 |
+
sys.exit()
|
273 |
+
|
274 |
+
n_dims = len(data.shape)
|
275 |
+
data_dtype = data.dtype
|
276 |
+
|
277 |
+
# if f32 desired, convert any float16 to float32
|
278 |
+
if ftype == 0 and data_dtype == np.float16:
|
279 |
+
data = data.astype(np.float32)
|
280 |
+
|
281 |
+
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
282 |
+
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
283 |
+
data = data.astype(np.float32)
|
284 |
+
|
285 |
+
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
286 |
+
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
287 |
+
data = data.astype(np.float16)
|
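The three checks above boil down to one decision: the stored type depends on the requested ftype, the tensor's rank and its name. A compact restatement as a sketch only, mirroring the rules above but not used by the script:
import numpy as np

def target_dtype(ftype, data_dtype, name, n_dims):
    # f32 output keeps everything in float32;
    # f16 output stores only 2-D *.weight tensors as float16
    if ftype == 0:
        return np.float32
    if data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return np.float16
    if data_dtype == np.float16 and n_dims == 1:
        return np.float32
    return data_dtype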
288 |
+
|
289 |
+
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
290 |
+
gguf_writer.add_tensor(new_name, data)
|
291 |
+
|
292 |
+
|
293 |
+
print("gguf: write header")
|
294 |
+
gguf_writer.write_header_to_file()
|
295 |
+
print("gguf: write metadata")
|
296 |
+
gguf_writer.write_kv_data_to_file()
|
297 |
+
if not args.vocab_only:
|
298 |
+
print("gguf: write tensors")
|
299 |
+
gguf_writer.write_tensors_to_file()
|
300 |
+
|
301 |
+
gguf_writer.close()
|
302 |
+
|
303 |
+
print(f"gguf: model successfully exported to '{fname_out}'")
|
304 |
+
print("")
|
convert-falcon-hf-to-gguf.py
ADDED
@@ -0,0 +1,281 @@
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# HF falcon--> gguf conversion
|
3 |
+
|
4 |
+
from __future__ import annotations
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import struct
|
10 |
+
import sys
|
11 |
+
from pathlib import Path
|
12 |
+
from typing import Any
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
from transformers import AutoTokenizer # type: ignore[import]
|
17 |
+
|
18 |
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
19 |
+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
20 |
+
import gguf
|
21 |
+
|
22 |
+
|
23 |
+
def bytes_to_unicode():
|
24 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
25 |
+
"""
|
26 |
+
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
|
27 |
+
The reversible bpe codes work on unicode strings.
|
28 |
+
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
29 |
+
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
30 |
+
This is a significant percentage of your normal, say, 32K bpe vocab.
|
31 |
+
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
32 |
+
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
33 |
+
"""
|
34 |
+
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
35 |
+
cs = bs[:]
|
36 |
+
n = 0
|
37 |
+
for b in range(2**8):
|
38 |
+
if b not in bs:
|
39 |
+
bs.append(b)
|
40 |
+
cs.append(2**8+n)
|
41 |
+
n += 1
|
42 |
+
return dict(zip(bs, (chr(n) for n in cs)))
|
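A minimal sketch of how these tables are used further down; the example token string is an assumption, with "Ġ" being the unicode stand-in for a leading space byte:
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
token_text = "Ġhello"                             # hypothetical BPE token
raw = bytes(byte_decoder[c] for c in token_text)
print(raw)                                        # b' hello'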
43 |
+
|
44 |
+
|
45 |
+
def count_model_parts(dir_model: Path) -> int:
|
46 |
+
num_parts = 0
|
47 |
+
for filename in os.listdir(dir_model):
|
48 |
+
if filename.startswith("pytorch_model-"):
|
49 |
+
num_parts += 1
|
50 |
+
|
51 |
+
if num_parts > 0:
|
52 |
+
print("gguf: found " + str(num_parts) + " model parts")
|
53 |
+
return num_parts
|
54 |
+
|
55 |
+
|
56 |
+
def parse_args() -> argparse.Namespace:
|
57 |
+
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
58 |
+
parser.add_argument(
|
59 |
+
"--vocab-only", action="store_true",
|
60 |
+
help="extract only the vocab",
|
61 |
+
)
|
62 |
+
parser.add_argument(
|
63 |
+
"--outfile", type=Path,
|
64 |
+
help="path to write to; default: based on input",
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"model", type=Path,
|
68 |
+
help="directory containing model file, or model file itself (*.bin)",
|
69 |
+
)
|
70 |
+
parser.add_argument(
|
71 |
+
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
72 |
+
help="output format - use 0 for float32, 1 for float16",
|
73 |
+
)
|
74 |
+
return parser.parse_args()
|
75 |
+
|
76 |
+
args = parse_args()
|
77 |
+
|
78 |
+
dir_model = args.model
|
79 |
+
ftype = args.ftype
|
80 |
+
if not dir_model.is_dir():
|
81 |
+
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
82 |
+
sys.exit(1)
|
83 |
+
|
84 |
+
# possible tensor data types
|
85 |
+
# ftype == 0 -> float32
|
86 |
+
# ftype == 1 -> float16
|
87 |
+
|
88 |
+
# map from ftype to string
|
89 |
+
ftype_str = ["f32", "f16"]
|
90 |
+
|
91 |
+
if args.outfile is not None:
|
92 |
+
fname_out = args.outfile
|
93 |
+
else:
|
94 |
+
# output in the same directory as the model by default
|
95 |
+
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
96 |
+
|
97 |
+
print("gguf: loading model "+dir_model.name)
|
98 |
+
|
99 |
+
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
100 |
+
hparams = json.load(f)
|
101 |
+
|
102 |
+
if hparams["architectures"][0] != "RWForCausalLM":
|
103 |
+
print("Model architecture not supported: " + hparams["architectures"][0])
|
104 |
+
|
105 |
+
sys.exit(1)
|
106 |
+
|
107 |
+
# get number of model parts
|
108 |
+
num_parts = count_model_parts(dir_model)
|
109 |
+
|
110 |
+
ARCH=gguf.MODEL_ARCH.FALCON
|
111 |
+
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
112 |
+
|
113 |
+
print("gguf: get model metadata")
|
114 |
+
|
115 |
+
block_count = hparams["n_layer"]
|
116 |
+
|
117 |
+
gguf_writer.add_name("Falcon")
|
118 |
+
gguf_writer.add_context_length(2048) # not in config.json
|
119 |
+
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
120 |
+
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
121 |
+
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
122 |
+
gguf_writer.add_block_count(block_count)
|
123 |
+
gguf_writer.add_head_count(hparams["n_head"])
|
124 |
+
if "n_head_kv" in hparams:
|
125 |
+
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
126 |
+
else:
|
127 |
+
gguf_writer.add_head_count_kv(1)
|
128 |
+
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
129 |
+
gguf_writer.add_file_type(ftype)
|
130 |
+
|
131 |
+
# TOKENIZATION
|
132 |
+
|
133 |
+
print("gguf: get tokenizer metadata")
|
134 |
+
|
135 |
+
tokens: list[bytearray] = []
|
136 |
+
scores: list[float] = []
|
137 |
+
toktypes: list[int] = []
|
138 |
+
|
139 |
+
tokenizer_json_file = dir_model / 'tokenizer.json'
|
140 |
+
if not tokenizer_json_file.is_file():
|
141 |
+
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
142 |
+
sys.exit(1)
|
143 |
+
|
144 |
+
# gpt2 tokenizer
|
145 |
+
gguf_writer.add_tokenizer_model("gpt2")
|
146 |
+
|
147 |
+
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
148 |
+
tokenizer_json = json.load(f)
|
149 |
+
|
150 |
+
print("gguf: get gpt2 tokenizer vocab")
|
151 |
+
|
152 |
+
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
153 |
+
# This causes downstream issues with mismatched tensor sizes when running the inference
|
154 |
+
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
155 |
+
|
156 |
+
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
157 |
+
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
158 |
+
|
159 |
+
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
160 |
+
byte_encoder = bytes_to_unicode()
|
161 |
+
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
162 |
+
|
163 |
+
for i in range(vocab_size):
|
164 |
+
if i in reverse_vocab:
|
165 |
+
try:
|
166 |
+
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
167 |
+
except KeyError:
|
168 |
+
text = bytearray()
|
169 |
+
for c in reverse_vocab[i]:
|
170 |
+
if ord(c) < 256: # single byte character
|
171 |
+
text.append(byte_decoder[ord(c)])
|
172 |
+
else: # multibyte special token character
|
173 |
+
text.extend(c.encode('utf-8'))
|
174 |
+
else:
|
175 |
+
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
176 |
+
pad_token = f"[PAD{i}]".encode("utf8")
|
177 |
+
text = bytearray(pad_token)
|
178 |
+
|
179 |
+
tokens.append(text)
|
180 |
+
scores.append(0.0) # dummy
|
181 |
+
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
182 |
+
|
183 |
+
gguf_writer.add_token_list(tokens)
|
184 |
+
gguf_writer.add_token_scores(scores)
|
185 |
+
gguf_writer.add_token_types(toktypes)
|
186 |
+
|
187 |
+
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
188 |
+
special_vocab.add_to_gguf(gguf_writer)
|
189 |
+
|
190 |
+
# TENSORS
|
191 |
+
|
192 |
+
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
193 |
+
|
194 |
+
# params for qkv transform
|
195 |
+
n_head = hparams["n_head"]
|
196 |
+
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
197 |
+
|
198 |
+
head_dim = hparams["hidden_size"] // n_head
|
199 |
+
|
200 |
+
# tensor info
|
201 |
+
print("gguf: get tensor metadata")
|
202 |
+
|
203 |
+
if num_parts == 0:
|
204 |
+
part_names = iter(("pytorch_model.bin",))
|
205 |
+
else:
|
206 |
+
part_names = (
|
207 |
+
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
208 |
+
)
|
209 |
+
|
210 |
+
for part_name in part_names:
|
211 |
+
if args.vocab_only:
|
212 |
+
break
|
213 |
+
print("gguf: loading model part '" + part_name + "'")
|
214 |
+
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
215 |
+
|
216 |
+
for name in model_part.keys():
|
217 |
+
data = model_part[name]
|
218 |
+
|
219 |
+
old_dtype = data.dtype
|
220 |
+
|
221 |
+
# convert any unsupported data types to float32
|
222 |
+
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
223 |
+
data = data.to(torch.float32)
|
224 |
+
|
225 |
+
# QKV tensor transform
|
226 |
+
# The original query_key_value tensor contains n_head_kv "kv groups",
|
227 |
+
# each consisting of n_head/n_head_kv query weights followed by one key
|
228 |
+
# and one value weight (shared by all query heads in the kv group).
|
229 |
+
# This layout makes it a big pain to work with in GGML.
|
230 |
+
# So we rearrange them here, so that we have n_head query weights
|
231 |
+
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
232 |
+
# in contiguous fashion.
|
233 |
+
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
234 |
+
|
235 |
+
if "query_key_value" in name:
|
236 |
+
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
237 |
+
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
238 |
+
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
239 |
+
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
240 |
+
data = torch.cat((q,k,v)).reshape_as(data)
|
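A shape-level sketch (illustration only, with made-up small dimensions rather than Falcon's real ones) of the rearrangement described above: the fused tensor is viewed per kv group and split into contiguous Q, K and V blocks.
import torch
n_head, n_head_kv, head_dim = 4, 2, 3                   # toy sizes, assumptions for the example
hidden = n_head * head_dim
data = torch.arange(n_head_kv * (n_head // n_head_kv + 2) * head_dim * hidden,
                    dtype=torch.float32).reshape(-1, hidden)
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, hidden)
q = qkv[:, :-2 ].reshape(n_head * head_dim, hidden)      # all query heads
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, hidden)   # one key head per kv group
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, hidden)   # one value head per kv group
print(q.shape, k.shape, v.shape)                         # torch.Size([12, 12]) torch.Size([6, 12]) torch.Size([6, 12])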
241 |
+
|
242 |
+
data = data.squeeze().numpy()
|
243 |
+
|
244 |
+
# map tensor names
|
245 |
+
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
246 |
+
if new_name is None:
|
247 |
+
print("Can not map tensor '" + name + "'")
|
248 |
+
sys.exit()
|
249 |
+
|
250 |
+
n_dims = len(data.shape)
|
251 |
+
data_dtype = data.dtype
|
252 |
+
|
253 |
+
# if f32 desired, convert any float16 to float32
|
254 |
+
if ftype == 0 and data_dtype == np.float16:
|
255 |
+
data = data.astype(np.float32)
|
256 |
+
|
257 |
+
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
258 |
+
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
259 |
+
data = data.astype(np.float32)
|
260 |
+
|
261 |
+
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
262 |
+
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
263 |
+
data = data.astype(np.float16)
|
264 |
+
|
265 |
+
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
266 |
+
|
267 |
+
gguf_writer.add_tensor(new_name, data)
|
268 |
+
|
269 |
+
|
270 |
+
print("gguf: write header")
|
271 |
+
gguf_writer.write_header_to_file()
|
272 |
+
print("gguf: write metadata")
|
273 |
+
gguf_writer.write_kv_data_to_file()
|
274 |
+
if not args.vocab_only:
|
275 |
+
print("gguf: write tensors")
|
276 |
+
gguf_writer.write_tensors_to_file()
|
277 |
+
|
278 |
+
gguf_writer.close()
|
279 |
+
|
280 |
+
print(f"gguf: model successfully exported to '{fname_out}'")
|
281 |
+
print("")
|
convert-gptneox-hf-to-gguf.py
ADDED
@@ -0,0 +1,251 @@
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# HF gptneox--> gguf conversion
|
3 |
+
|
4 |
+
from __future__ import annotations
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import struct
|
10 |
+
import sys
|
11 |
+
from pathlib import Path
|
12 |
+
from typing import Any
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
from transformers import AutoTokenizer # type: ignore[import]
|
17 |
+
|
18 |
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
19 |
+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
20 |
+
import gguf
|
21 |
+
|
22 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
23 |
+
|
24 |
+
|
25 |
+
def bytes_to_unicode():
|
26 |
+
"""
|
27 |
+
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
|
28 |
+
The reversible bpe codes work on unicode strings.
|
29 |
+
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
30 |
+
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
31 |
+
This is a significant percentage of your normal, say, 32K bpe vocab.
|
32 |
+
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
33 |
+
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
34 |
+
"""
|
35 |
+
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
36 |
+
cs = bs[:]
|
37 |
+
n = 0
|
38 |
+
for b in range(2**8):
|
39 |
+
if b not in bs:
|
40 |
+
bs.append(b)
|
41 |
+
cs.append(2**8+n)
|
42 |
+
n += 1
|
43 |
+
return dict(zip(bs, (chr(n) for n in cs)))
|
44 |
+
|
45 |
+
|
46 |
+
def count_model_parts(dir_model: Path) -> int:
|
47 |
+
num_parts = 0
|
48 |
+
for filename in os.listdir(dir_model):
|
49 |
+
if filename.startswith("pytorch_model-"):
|
50 |
+
num_parts += 1
|
51 |
+
|
52 |
+
if num_parts > 0:
|
53 |
+
print("gguf: found " + str(num_parts) + " model parts")
|
54 |
+
return num_parts
|
55 |
+
|
56 |
+
|
57 |
+
def parse_args() -> argparse.Namespace:
|
58 |
+
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
59 |
+
parser.add_argument(
|
60 |
+
"--vocab-only", action="store_true",
|
61 |
+
help="extract only the vocab",
|
62 |
+
)
|
63 |
+
parser.add_argument(
|
64 |
+
"--outfile", type=Path,
|
65 |
+
help="path to write to; default: based on input",
|
66 |
+
)
|
67 |
+
parser.add_argument(
|
68 |
+
"model", type=Path,
|
69 |
+
help="directory containing model file, or model file itself (*.bin)",
|
70 |
+
)
|
71 |
+
parser.add_argument(
|
72 |
+
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
73 |
+
help="output format - use 0 for float32, 1 for float16",
|
74 |
+
)
|
75 |
+
return parser.parse_args()
|
76 |
+
|
77 |
+
args = parse_args()
|
78 |
+
|
79 |
+
dir_model = args.model
|
80 |
+
ftype = args.ftype
|
81 |
+
if not dir_model.is_dir():
|
82 |
+
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
83 |
+
sys.exit(1)
|
84 |
+
|
85 |
+
# possible tensor data types
|
86 |
+
# ftype == 0 -> float32
|
87 |
+
# ftype == 1 -> float16
|
88 |
+
|
89 |
+
# map from ftype to string
|
90 |
+
ftype_str = ["f32", "f16"]
|
91 |
+
|
92 |
+
if args.outfile is not None:
|
93 |
+
fname_out = args.outfile
|
94 |
+
else:
|
95 |
+
# output in the same directory as the model by default
|
96 |
+
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
97 |
+
|
98 |
+
print("gguf: loading model "+dir_model.name)
|
99 |
+
|
100 |
+
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
101 |
+
hparams = json.load(f)
|
102 |
+
|
103 |
+
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
|
104 |
+
print("Model architecture not supported: " + hparams["architectures"][0])
|
105 |
+
|
106 |
+
sys.exit()
|
107 |
+
|
108 |
+
# get number of model parts
|
109 |
+
num_parts = count_model_parts(dir_model)
|
110 |
+
|
111 |
+
ARCH=gguf.MODEL_ARCH.GPTNEOX
|
112 |
+
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
113 |
+
|
114 |
+
print("gguf: get model metadata")
|
115 |
+
|
116 |
+
block_count = hparams["num_hidden_layers"]
|
117 |
+
|
118 |
+
gguf_writer.add_name(dir_model.name)
|
119 |
+
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
120 |
+
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
121 |
+
gguf_writer.add_block_count(block_count)
|
122 |
+
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
123 |
+
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
|
124 |
+
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
125 |
+
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
126 |
+
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
127 |
+
|
128 |
+
# TOKENIZATION
|
129 |
+
|
130 |
+
print("gguf: get tokenizer metadata")
|
131 |
+
|
132 |
+
tokens: list[bytearray] = []
|
133 |
+
|
134 |
+
tokenizer_json_file = dir_model / 'tokenizer.json'
|
135 |
+
if not tokenizer_json_file.is_file():
|
136 |
+
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
137 |
+
sys.exit(1)
|
138 |
+
|
139 |
+
# gpt2 tokenizer
|
140 |
+
gguf_writer.add_tokenizer_model("gpt2")
|
141 |
+
|
142 |
+
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
143 |
+
tokenizer_json = json.load(f)
|
144 |
+
|
145 |
+
print("gguf: get gpt2 tokenizer vocab")
|
146 |
+
|
147 |
+
vocab_size = len(tokenizer_json["model"]["vocab"])
|
148 |
+
|
149 |
+
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
150 |
+
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
151 |
+
|
152 |
+
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
153 |
+
byte_encoder = bytes_to_unicode()
|
154 |
+
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
155 |
+
|
156 |
+
for i in range(vocab_size):
|
157 |
+
if i in reverse_vocab:
|
158 |
+
try:
|
159 |
+
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
160 |
+
except KeyError:
|
161 |
+
text = bytearray()
|
162 |
+
for c in reverse_vocab[i]:
|
163 |
+
if ord(c) < 256: # single byte character
|
164 |
+
text.append(byte_decoder[ord(c)])
|
165 |
+
else: # multibyte special token character
|
166 |
+
text.extend(c.encode('utf-8'))
|
167 |
+
else:
|
168 |
+
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
169 |
+
pad_token = f"[PAD{i}]".encode("utf8")
|
170 |
+
text = bytearray(pad_token)
|
171 |
+
|
172 |
+
tokens.append(text)
|
173 |
+
|
174 |
+
gguf_writer.add_token_list(tokens)
|
175 |
+
|
176 |
+
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
177 |
+
special_vocab.add_to_gguf(gguf_writer)
|
178 |
+
|
179 |
+
# TENSORS
|
180 |
+
|
181 |
+
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
182 |
+
|
183 |
+
# tensor info
|
184 |
+
print("gguf: get tensor metadata")
|
185 |
+
|
186 |
+
if num_parts == 0:
|
187 |
+
part_names = iter(("pytorch_model.bin",))
|
188 |
+
else:
|
189 |
+
part_names = (
|
190 |
+
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
191 |
+
)
|
192 |
+
|
193 |
+
for part_name in part_names:
|
194 |
+
if args.vocab_only:
|
195 |
+
break
|
196 |
+
print("gguf: loading model part '" + part_name + "'")
|
197 |
+
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
198 |
+
|
199 |
+
for name in model_part.keys():
|
200 |
+
data = model_part[name]
|
201 |
+
|
202 |
+
# we don't need these
|
203 |
+
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
|
204 |
+
continue
|
205 |
+
|
206 |
+
old_dtype = data.dtype
|
207 |
+
|
208 |
+
# convert any unsupported data types to float32
|
209 |
+
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
210 |
+
data = data.to(torch.float32)
|
211 |
+
|
212 |
+
data = data.squeeze().numpy()
|
213 |
+
|
214 |
+
# map tensor names
|
215 |
+
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
216 |
+
if new_name is None:
|
217 |
+
print("Can not map tensor '" + name + "'")
|
218 |
+
sys.exit()
|
219 |
+
|
220 |
+
n_dims = len(data.shape)
|
221 |
+
data_dtype = data.dtype
|
222 |
+
|
223 |
+
# if f32 desired, convert any float16 to float32
|
224 |
+
if ftype == 0 and data_dtype == np.float16:
|
225 |
+
data = data.astype(np.float32)
|
226 |
+
|
227 |
+
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
228 |
+
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
229 |
+
data = data.astype(np.float32)
|
230 |
+
|
231 |
+
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
232 |
+
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
233 |
+
data = data.astype(np.float16)
|
234 |
+
|
235 |
+
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
236 |
+
|
237 |
+
gguf_writer.add_tensor(new_name, data)
|
238 |
+
|
239 |
+
|
240 |
+
print("gguf: write header")
|
241 |
+
gguf_writer.write_header_to_file()
|
242 |
+
print("gguf: write metadata")
|
243 |
+
gguf_writer.write_kv_data_to_file()
|
244 |
+
if not args.vocab_only:
|
245 |
+
print("gguf: write tensors")
|
246 |
+
gguf_writer.write_tensors_to_file()
|
247 |
+
|
248 |
+
gguf_writer.close()
|
249 |
+
|
250 |
+
print(f"gguf: model successfully exported to '{fname_out}'")
|
251 |
+
print("")
|
convert-llama-ggml-to-gguf.py
ADDED
@@ -0,0 +1,451 @@
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import math
|
6 |
+
import struct
|
7 |
+
import sys
|
8 |
+
from enum import IntEnum
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
import os
|
14 |
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
15 |
+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
16 |
+
import gguf
|
17 |
+
|
18 |
+
# Note: Does not support GGML_QKK_64
|
19 |
+
QK_K = 256
|
20 |
+
# Items here are (block size, type size)
|
21 |
+
GGML_QUANT_SIZES = {
|
22 |
+
gguf.GGMLQuantizationType.F32 : (1, 4),
|
23 |
+
gguf.GGMLQuantizationType.F16 : (1, 2),
|
24 |
+
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
|
25 |
+
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
|
26 |
+
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
|
27 |
+
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
|
28 |
+
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
|
29 |
+
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
|
30 |
+
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
|
31 |
+
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
|
32 |
+
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
|
33 |
+
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
|
34 |
+
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
|
35 |
+
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
|
36 |
+
}
|
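For example (illustration only), these (block size, type size) pairs give the byte size of a tensor the same way Tensor.load computes it below, as elements * type_size // block_size. For a hypothetical 4096x4096 Q4_0 tensor:
n_elems = 4096 * 4096
blksize, tysize = 32, 2 + 16          # Q4_0: 32 elements per 18-byte block
print(n_elems * tysize // blksize)    # 9437184 bytes (~9 MiB)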
37 |
+
|
38 |
+
class GGMLFormat(IntEnum):
|
39 |
+
GGML = 0
|
40 |
+
GGMF = 1
|
41 |
+
GGJT = 2
|
42 |
+
|
43 |
+
class GGMLFType(IntEnum):
|
44 |
+
ALL_F32 = 0
|
45 |
+
MOSTLY_F16 = 1
|
46 |
+
MOSTLY_Q4_0 = 2
|
47 |
+
MOSTLY_Q4_1 = 3
|
48 |
+
MOSTLY_Q4_1_SOME_F16 = 4
|
49 |
+
MOSTLY_Q8_0 = 7
|
50 |
+
MOSTLY_Q5_0 = 8
|
51 |
+
MOSTLY_Q5_1 = 9
|
52 |
+
MOSTLY_Q2_K = 10
|
53 |
+
MOSTLY_Q3_K_S = 11
|
54 |
+
MOSTLY_Q3_K_M = 12
|
55 |
+
MOSTLY_Q3_K_L = 13
|
56 |
+
MOSTLY_Q4_K_S = 14
|
57 |
+
MOSTLY_Q4_K_M = 15
|
58 |
+
MOSTLY_Q5_K_S = 16
|
59 |
+
MOSTLY_Q5_K_M = 17
|
60 |
+
MOSTLY_Q6_K = 18
|
61 |
+
|
62 |
+
class Hyperparameters:
|
63 |
+
def __init__(self):
|
64 |
+
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
65 |
+
self.n_layer = self.n_rot = self.n_ff = 0
|
66 |
+
self.ftype = GGMLFType.ALL_F32
|
67 |
+
|
68 |
+
def set_n_ff(self, model):
|
69 |
+
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
|
70 |
+
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
|
71 |
+
ff_tensor = model.tensors[ff_tensor_idx]
|
72 |
+
self.n_ff = ff_tensor.dims[1]
|
73 |
+
|
74 |
+
def load(self, data, offset):
|
75 |
+
(
|
76 |
+
self.n_vocab,
|
77 |
+
self.n_embd,
|
78 |
+
self.n_mult,
|
79 |
+
self.n_head,
|
80 |
+
self.n_layer,
|
81 |
+
self.n_rot,
|
82 |
+
ftype,
|
83 |
+
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
|
84 |
+
try:
|
85 |
+
self.ftype = GGMLFType(ftype)
|
86 |
+
except ValueError:
|
87 |
+
raise ValueError(f'Invalid ftype {ftype}')
|
88 |
+
return 4 * 7
|
89 |
+
|
90 |
+
def __str__(self):
|
91 |
+
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
92 |
+
|
93 |
+
class Vocab:
|
94 |
+
def __init__(self, load_scores = True):
|
95 |
+
self.items = []
|
96 |
+
self.load_scores = load_scores
|
97 |
+
|
98 |
+
def load(self, data, offset, n_vocab):
|
99 |
+
orig_offset = offset
|
100 |
+
for _ in range(n_vocab):
|
101 |
+
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
|
102 |
+
assert itemlen < 4096, 'Absurd vocab item length'
|
103 |
+
offset += 4
|
104 |
+
item_text = bytes(data[offset:offset + itemlen])
|
105 |
+
offset += itemlen
|
106 |
+
if self.load_scores:
|
107 |
+
item_score = struct.unpack('<f', data[offset:offset + 4])[0]
|
108 |
+
offset += 4
|
109 |
+
else:
|
110 |
+
item_score = 0.0
|
111 |
+
self.items.append((item_text, item_score))
|
112 |
+
return offset - orig_offset
|
113 |
+
|
114 |
+
class Tensor:
|
115 |
+
def __init__(self, use_padding = True):
|
116 |
+
self.name = None
|
117 |
+
self.dims: tuple[int, ...] = ()
|
118 |
+
self.dtype = None
|
119 |
+
self.start_offset = 0
|
120 |
+
self.len_bytes = np.int64(0)
|
121 |
+
self.use_padding = use_padding
|
122 |
+
|
123 |
+
def load(self, data, offset):
|
124 |
+
orig_offset = offset
|
125 |
+
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
|
126 |
+
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
|
127 |
+
assert name_len < 4096, 'Absurd tensor name length'
|
128 |
+
quant = GGML_QUANT_SIZES.get(dtype)
|
129 |
+
assert quant is not None, 'Unknown tensor type'
|
130 |
+
(blksize, tysize) = quant
|
131 |
+
offset += 12
|
132 |
+
self.dtype= dtype
|
133 |
+
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
|
134 |
+
offset += 4 * n_dims
|
135 |
+
self.name = bytes(data[offset:offset + name_len])
|
136 |
+
offset += name_len
|
137 |
+
pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
|
138 |
+
offset += pad
|
139 |
+
n_elems = np.prod(self.dims)
|
140 |
+
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
|
141 |
+
self.start_offset = offset
|
142 |
+
self.len_bytes = n_bytes
|
143 |
+
offset += n_bytes
|
144 |
+
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
145 |
+
return offset - orig_offset
|
146 |
+
|
147 |
+
class GGMLModel:
|
148 |
+
def __init__(self):
|
149 |
+
self.hyperparameters = None
|
150 |
+
self.vocab = None
|
151 |
+
self.tensor_map = {}
|
152 |
+
self.tensors = []
|
153 |
+
|
154 |
+
def validate_header(self, data, offset):
|
155 |
+
magic = bytes(data[offset:offset + 4])
|
156 |
+
if magic == b'GGUF':
|
157 |
+
raise ValueError('File is already in GGUF format.')
|
158 |
+
if magic == b'lmgg':
|
159 |
+
self.file_format = GGMLFormat.GGML
|
160 |
+
self.format_version = 1
|
161 |
+
return 4
|
162 |
+
version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
|
163 |
+
if magic == b'fmgg':
|
164 |
+
if version != 1:
|
165 |
+
raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
|
166 |
+
self.file_format = GGMLFormat.GGMF
|
167 |
+
self.format_version = version
|
168 |
+
return 8
|
169 |
+
if magic == b'tjgg':
|
170 |
+
if version < 1 or version > 3:
|
171 |
+
raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
|
172 |
+
self.file_format = GGMLFormat.GGJT
|
173 |
+
self.format_version = version
|
174 |
+
return 8
|
175 |
+
raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
|
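The magic values checked above are the ASCII tags read in little-endian byte order, i.e. reversed; a quick check, not part of the converter:
print(b'lmgg'[::-1], b'fmgg'[::-1], b'tjgg'[::-1])   # b'ggml' b'ggmf' b'ggjt'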
176 |
+
|
177 |
+
def validate_conversion(self, ftype):
|
178 |
+
err = ''
|
179 |
+
if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
|
180 |
+
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
|
181 |
+
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
|
182 |
+
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
|
183 |
+
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
184 |
+
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
185 |
+
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
|
186 |
+
if len(err) > 0:
|
187 |
+
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
|
188 |
+
|
189 |
+
def load(self, data, offset):
|
190 |
+
offset += self.validate_header(data, offset)
|
191 |
+
hp = Hyperparameters()
|
192 |
+
offset += hp.load(data, offset)
|
193 |
+
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
|
194 |
+
self.validate_conversion(hp.ftype)
|
195 |
+
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
|
196 |
+
offset += vocab.load(data, offset, hp.n_vocab)
|
197 |
+
tensors: list[Tensor] = []
|
198 |
+
tensor_map = {}
|
199 |
+
while offset < len(data):
|
200 |
+
tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
|
201 |
+
offset += tensor.load(data, offset)
|
202 |
+
tensor_map[tensor.name] = len(tensors)
|
203 |
+
tensors.append(tensor)
|
204 |
+
self.hyperparameters = hp
|
205 |
+
self.vocab = vocab
|
206 |
+
self.tensors = tensors
|
207 |
+
self.tensor_map = tensor_map
|
208 |
+
hp.set_n_ff(self)
|
209 |
+
return offset
|
210 |
+
|
211 |
+
class GGMLToGGUF:
|
212 |
+
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
213 |
+
hp = ggml_model.hyperparameters
|
214 |
+
self.model = ggml_model
|
215 |
+
self.data = data
|
216 |
+
self.cfg = cfg
|
217 |
+
self.params_override = params_override
|
218 |
+
self.vocab_override = vocab_override
|
219 |
+
self.special_vocab = special_vocab
|
220 |
+
if params_override is not None:
|
221 |
+
n_kv_head = params_override.n_head_kv
|
222 |
+
else:
|
223 |
+
if cfg.gqa == 1:
|
224 |
+
n_kv_head = hp.n_head
|
225 |
+
else:
|
226 |
+
gqa = float(cfg.gqa)
|
227 |
+
n_kv_head = None
|
228 |
+
for x in range(1, 256):
|
229 |
+
if float(hp.n_head) / float(x) == gqa:
|
230 |
+
n_kv_head = x
|
231 |
+
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
232 |
+
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
233 |
+
self.n_kv_head = n_kv_head
|
234 |
+
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
|
235 |
+
|
236 |
+
def save(self):
|
237 |
+
print('* Preparing to save GGUF file')
|
238 |
+
gguf_writer = gguf.GGUFWriter(
|
239 |
+
self.cfg.output,
|
240 |
+
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
241 |
+
use_temp_file = False )
|
242 |
+
self.add_params(gguf_writer)
|
243 |
+
self.add_vocab(gguf_writer)
|
244 |
+
if self.special_vocab is not None:
|
245 |
+
self.special_vocab.add_to_gguf(gguf_writer)
|
246 |
+
self.add_tensors(gguf_writer)
|
247 |
+
print(" gguf: write header")
|
248 |
+
gguf_writer.write_header_to_file()
|
249 |
+
print(" gguf: write metadata")
|
250 |
+
gguf_writer.write_kv_data_to_file()
|
251 |
+
print(" gguf: write tensors")
|
252 |
+
gguf_writer.write_tensors_to_file()
|
253 |
+
gguf_writer.close()
|
254 |
+
|
255 |
+
def add_params(self, gguf_writer):
|
256 |
+
hp = self.model.hyperparameters
|
257 |
+
cfg = self.cfg
|
258 |
+
if cfg.desc is not None:
|
259 |
+
desc = cfg.desc
|
260 |
+
else:
|
261 |
+
desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
|
262 |
+
try:
|
263 |
+
# Filenames aren't necessarily valid UTF8.
|
264 |
+
name = cfg.name if cfg.name is not None else cfg.input.name
|
265 |
+
except UnicodeDecodeError:
|
266 |
+
name = None
|
267 |
+
print('* Adding model parameters and KV items')
|
268 |
+
if name is not None:
|
269 |
+
gguf_writer.add_name(name)
|
270 |
+
gguf_writer.add_description(desc)
|
271 |
+
gguf_writer.add_file_type(int(hp.ftype))
|
272 |
+
if self.params_override is not None:
|
273 |
+
po = self.params_override
|
274 |
+
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
|
275 |
+
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
|
276 |
+
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
|
277 |
+
gguf_writer.add_context_length (po.n_ctx)
|
278 |
+
gguf_writer.add_embedding_length (po.n_embd)
|
279 |
+
gguf_writer.add_block_count (po.n_layer)
|
280 |
+
gguf_writer.add_feed_forward_length (po.n_ff)
|
281 |
+
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
|
282 |
+
gguf_writer.add_head_count (po.n_head)
|
283 |
+
gguf_writer.add_head_count_kv (po.n_head_kv)
|
284 |
+
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
|
285 |
+
return
|
286 |
+
gguf_writer.add_context_length(cfg.context_length)
|
287 |
+
gguf_writer.add_embedding_length(hp.n_embd)
|
288 |
+
gguf_writer.add_block_count(hp.n_layer)
|
289 |
+
gguf_writer.add_feed_forward_length(hp.n_ff)
|
290 |
+
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
|
291 |
+
gguf_writer.add_head_count(hp.n_head)
|
292 |
+
gguf_writer.add_head_count_kv(self.n_kv_head)
|
293 |
+
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
|
294 |
+
|
295 |
+
def add_vocab(self, gguf_writer):
|
296 |
+
hp = self.model.hyperparameters
|
297 |
+
gguf_writer.add_tokenizer_model('llama')
|
298 |
+
tokens = []
|
299 |
+
scores = []
|
300 |
+
toktypes = []
|
301 |
+
if self.vocab_override is not None:
|
302 |
+
vo = self.vocab_override
|
303 |
+
print('* Adding vocab item(s)')
|
304 |
+
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
305 |
+
tokens.append(vbytes)
|
306 |
+
scores.append(score)
|
307 |
+
toktypes.append(ttype)
|
308 |
+
assert len(tokens) == hp.n_vocab, \
|
309 |
+
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
|
310 |
+
gguf_writer.add_token_list(tokens)
|
311 |
+
gguf_writer.add_token_scores(scores)
|
312 |
+
if len(toktypes) > 0:
|
313 |
+
gguf_writer.add_token_types(toktypes)
|
314 |
+
return
|
315 |
+
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
316 |
+
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
|
317 |
+
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
318 |
+
tt = 1 # Normal
|
319 |
+
# Special handling for UNK, BOS, EOS tokens.
|
320 |
+
if tokid <= 2:
|
321 |
+
if tokid == 0:
|
322 |
+
vbytes = b'<unk>'
|
323 |
+
tt = 2
|
324 |
+
elif tokid == 1:
|
325 |
+
vbytes = b'<s>'
|
326 |
+
tt = 3
|
327 |
+
else:
|
328 |
+
vbytes = b'</s>'
|
329 |
+
tt = 3
|
330 |
+
elif len(vbytes) == 0:
|
331 |
+
tt = 3 # Control
|
332 |
+
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
|
333 |
+
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
|
334 |
+
tt = 6 # Byte
|
335 |
+
else:
|
336 |
+
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
|
337 |
+
toktypes.append(tt)
|
338 |
+
tokens.append(vbytes)
|
339 |
+
scores.append(vscore)
|
340 |
+
gguf_writer.add_token_list(tokens)
|
341 |
+
gguf_writer.add_token_scores(scores)
|
342 |
+
gguf_writer.add_token_types(toktypes)
|
343 |
+
gguf_writer.add_unk_token_id(0)
|
344 |
+
gguf_writer.add_bos_token_id(1)
|
345 |
+
gguf_writer.add_eos_token_id(2)
|
346 |
+
|
347 |
+
def add_tensors(self, gguf_writer):
|
348 |
+
tensor_map = self.name_map
|
349 |
+
data = self.data
|
350 |
+
print(f'* Adding {len(self.model.tensors)} tensor(s)')
|
351 |
+
for tensor in self.model.tensors:
|
352 |
+
name = str(tensor.name, 'UTF-8')
|
353 |
+
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
354 |
+
assert mapped_name is not None, f'Bad name {name}'
|
355 |
+
tempdims = list(tensor.dims[:])
|
356 |
+
if len(tempdims) > 1:
|
357 |
+
temp = tempdims[1]
|
358 |
+
tempdims[1] = tempdims[0]
|
359 |
+
tempdims[0] = temp
|
360 |
+
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
|
361 |
+
gguf_writer.add_tensor(
|
362 |
+
mapped_name,
|
363 |
+
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
364 |
+
raw_shape = tempdims,
|
365 |
+
raw_dtype = tensor.dtype )
|
366 |
+
|
367 |
+
def handle_metadata(cfg, hp):
|
368 |
+
import convert
|
369 |
+
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
370 |
+
hf_config_path = cfg.model_metadata_dir / "config.json"
|
371 |
+
orig_config_path = cfg.model_metadata_dir / "params.json"
|
372 |
+
# We pass a fake model here. "original" mode will check the shapes of some
|
373 |
+
# tensors if information is missing in the .json file: other than that, the
|
374 |
+
# model data isn't used so this should be safe (at least for now).
|
375 |
+
fakemodel = {
|
376 |
+
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
|
377 |
+
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
|
378 |
+
}
|
379 |
+
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
|
380 |
+
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
|
381 |
+
if hf_config_path.exists():
|
382 |
+
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
|
383 |
+
elif orig_config_path.exists():
|
384 |
+
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
385 |
+
else:
|
386 |
+
raise ValueError('Unable to load metadata')
|
387 |
+
vocab = convert.load_vocab(
|
388 |
+
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
389 |
+
cfg.vocabtype )
|
390 |
+
# FIXME: Respect cfg.vocab_dir?
|
391 |
+
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
|
392 |
+
convert.check_vocab_size(params, vocab)
|
393 |
+
return (params, vocab, svocab)
|
394 |
+
|
395 |
+
def handle_args():
|
396 |
+
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
397 |
+
parser.add_argument('--input', '-i', type = Path, required = True,
|
398 |
+
help = 'Input GGMLv3 filename')
|
399 |
+
parser.add_argument('--output', '-o', type = Path, required = True,
|
400 |
+
help ='Output GGUF filename')
|
401 |
+
parser.add_argument('--name',
|
402 |
+
help = 'Set model name')
|
403 |
+
parser.add_argument('--desc',
|
404 |
+
help = 'Set model description')
|
405 |
+
parser.add_argument('--gqa', type = int, default = 1,
|
406 |
+
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
407 |
+
parser.add_argument('--eps', default = '5.0e-06',
|
408 |
+
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
409 |
+
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
410 |
+
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
411 |
+
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
412 |
+
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
413 |
+
parser.add_argument("--vocab-dir", type=Path,
|
414 |
+
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
415 |
+
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
416 |
+
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
417 |
+
return parser.parse_args()
|
418 |
+
|
419 |
+
def main():
|
420 |
+
cfg = handle_args()
|
421 |
+
print(f'* Using config: {cfg}')
|
422 |
+
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
|
423 |
+
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
|
424 |
+
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
|
425 |
+
data = np.memmap(cfg.input, mode = 'r')
|
426 |
+
model = GGMLModel()
|
427 |
+
print('* Scanning GGML input file')
|
428 |
+
offset = model.load(data, 0)
|
429 |
+
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
430 |
+
vocab_override = None
|
431 |
+
params_override = None
|
432 |
+
special_vocab = None
|
433 |
+
if cfg.model_metadata_dir is not None:
|
434 |
+
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
|
435 |
+
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
436 |
+
print(f'* Overriding params: {params_override}')
|
437 |
+
print(f'* Overriding vocab: {vocab_override}')
|
438 |
+
print(f'* Special vocab: {special_vocab}')
|
439 |
+
else:
|
440 |
+
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
441 |
+
if model.file_format == GGMLFormat.GGML:
|
442 |
+
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
443 |
+
converter = GGMLToGGUF(model, data, cfg,
|
444 |
+
params_override = params_override,
|
445 |
+
vocab_override = vocab_override,
|
446 |
+
special_vocab = special_vocab )
|
447 |
+
converter.save()
|
448 |
+
print(f'* Successful completion. Output saved to: {cfg.output}')
|
449 |
+
|
450 |
+
if __name__ == '__main__':
|
451 |
+
main()
|
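The GGMLToGGUF constructor above guesses the number of KV heads from the --gqa factor by scanning for a divisor of n_head. A minimal standalone sketch of that derivation, with made-up head counts and not part of the script:

# Sketch of the n_kv_head guess used by GGMLToGGUF.__init__ (hypothetical values).
def guess_n_kv_head(n_head: int, gqa: int) -> int:
    if gqa == 1:
        return n_head
    for x in range(1, 256):
        if float(n_head) / float(x) == float(gqa):
            return x
    raise ValueError("Couldn't determine n_kv_head from GQA param")

# e.g. a LLaMA2-70B-style config: 64 attention heads with --gqa 8 -> 8 KV heads
print(guess_n_kv_head(64, 8))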
convert-lora-to-ggml.py
CHANGED
@@ -1,28 +1,29 @@
-#!/usr/bin/env
+#!/usr/bin/env python3
+from __future__ import annotations
+
 import json
 import os
 import re
 import struct
 import sys
-from typing import Any,
+from typing import Any, BinaryIO, Sequence
 
+import numpy as np
 import torch
 
+NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
+
 
 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "
-    "self_attn.k_proj": "
-    "self_attn.v_proj": "
-    "self_attn.o_proj": "
-    "mlp.gate_proj": "
-    "mlp.down_proj": "
-    "mlp.up_proj": "
-    "input_layernorm": "
+    "self_attn.q_proj": "attn_q",
+    "self_attn.k_proj": "attn_k",
+    "self_attn.v_proj": "attn_v",
+    "self_attn.o_proj": "attn_output",
+    "mlp.gate_proj": "ffn_gate",
+    "mlp.down_proj": "ffn_down",
+    "mlp.up_proj": "ffn_up",
+    "input_layernorm": "attn_norm",
     "post_attention_layernorm": "ffn_norm",
-    # "norm": "norm",
-    # "embed_tokens": "tok_embeddings",
-    # "lm_head": "output",
 }
 
 
@@ -39,7 +40,7 @@ def translate_tensor_name(t: str) -> str:
            sys.exit(1)
 
        output_string = (
-            f"
+            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
@@ -47,19 +48,21 @@ def translate_tensor_name(t: str) -> str:
        sys.exit(1)
 
 
-def write_file_header(fout:
+def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1]) # magic (ggml lora)
    fout.write(struct.pack("i", 1)) # file version
    fout.write(struct.pack("i", params["r"]))
    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
    # but some models ship a float value instead
    # let's convert to int, but fail if lossless conversion is not possible
-    assert
+    assert (
+        int(params["lora_alpha"]) == params["lora_alpha"]
+    ), "cannot convert float to int losslessly"
    fout.write(struct.pack("i", int(params["lora_alpha"])))
 
 
 def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type:
+    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
@@ -67,7 +70,7 @@ def write_tensor_header(
            "iii",
            len(shape),
            len(sname),
-
+            NUMPY_TYPE_TO_FTYPE[data_type.name],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
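The `ggla` header written by write_file_header is four 32-bit fields: the reversed magic, a file version, the LoRA r and the LoRA alpha. A small sketch of reading it back, assuming a hypothetical adapter path and the same native int packing used above:

import struct

# Read back the header emitted by write_file_header (hypothetical output path).
with open("ggml-adapter-model.bin", "rb") as fin:
    magic = fin.read(4)[::-1]  # stored as b"ggla"[::-1], reverse again to compare
    assert magic == b"ggla", "not a ggml LoRA adapter"
    version, lora_r, lora_alpha = struct.unpack("iii", fin.read(12))
    print(f"version={version} r={lora_r} alpha={lora_alpha}")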
convert-starcoder-hf-to-gguf.py
ADDED
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def bytes_to_unicode():
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
    sys.exit(1)

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)

print("gguf: get gpt2 tokenizer vocab")

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

for i in range(vocab_size):
    if i in reverse_vocab:
        try:
            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode('utf-8'))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)

    tokens.append(text)
    scores.append(0.0) # dymmy
    toktypes.append(gguf.TokenType.NORMAL) # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
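The gpt2-style vocab handling above relies on bytes_to_unicode being exactly invertible, so the raw token bytes can be recovered through byte_decoder. A small check of that round trip, reusing the bytes_to_unicode function defined in the script above (illustrative only):

# Round-trip check for the byte <-> unicode mapping used above (illustrative only).
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

encoded = "".join(byte_encoder[b] for b in "hello gguf".encode("utf-8"))
decoded = bytes(byte_decoder[c] for c in encoded)
assert decoded == b"hello gguf"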
convert.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
#!/usr/bin/env
|
|
|
|
|
2 |
import argparse
|
3 |
import concurrent.futures
|
4 |
import copy
|
@@ -15,141 +17,151 @@ import re
|
|
15 |
import signal
|
16 |
import struct
|
17 |
import sys
|
|
|
18 |
import zipfile
|
19 |
from abc import ABCMeta, abstractmethod
|
|
|
20 |
from dataclasses import dataclass
|
21 |
from pathlib import Path
|
22 |
-
from typing import
|
23 |
-
Literal, Optional, Sequence, Tuple, TypeVar, Union)
|
24 |
|
25 |
import numpy as np
|
26 |
-
from sentencepiece import SentencePieceProcessor # type: ignore
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
if TYPE_CHECKING:
|
29 |
-
from
|
30 |
|
31 |
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
32 |
faulthandler.register(signal.SIGUSR1)
|
33 |
|
34 |
-
NDArray:
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
@dataclass(frozen=True)
|
38 |
-
class
|
39 |
name: str
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
DT_F32 = UnquantizedDataType('F32')
|
44 |
-
DT_I32 = UnquantizedDataType('I32')
|
45 |
-
DT_BF16 = UnquantizedDataType('BF16')
|
46 |
-
|
47 |
|
48 |
@dataclass(frozen=True)
|
49 |
-
class
|
50 |
-
|
51 |
-
have_addends: bool
|
52 |
-
have_g_idx: bool
|
53 |
-
|
54 |
|
55 |
-
|
56 |
-
|
|
|
|
|
57 |
|
58 |
-
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
DT_F16: 1,
|
63 |
-
DT_Q4_0: 2,
|
64 |
-
DT_Q4_1: 3,
|
65 |
-
}
|
66 |
|
67 |
-
|
68 |
-
|
|
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
}
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
|
89 |
-
if len(tensor.shape) == 1:
|
90 |
-
# 1D tensors are always F32.
|
91 |
-
return DT_F32
|
92 |
-
elif self == GGMLFileType.AllF32:
|
93 |
-
return DT_F32
|
94 |
-
elif self == GGMLFileType.MostlyF16:
|
95 |
-
return DT_F16
|
96 |
-
elif self == GGMLFileType.MostlyQ4_0:
|
97 |
-
return DT_Q4_0
|
98 |
-
elif self == GGMLFileType.MostlyQ4_1:
|
99 |
-
return DT_Q4_1
|
100 |
-
elif self == GGMLFileType.PerLayerIsQ4_1:
|
101 |
-
if name in ('output.weight', 'tok_embeddings.weight'):
|
102 |
-
return DT_F16
|
103 |
-
else:
|
104 |
-
return DT_Q4_1
|
105 |
-
else:
|
106 |
raise ValueError(self)
|
|
|
|
|
107 |
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
'norm.weight',
|
113 |
-
'output.weight',
|
114 |
-
]
|
115 |
-
for i in range(80): # maximum number of layer
|
116 |
-
ret += [
|
117 |
-
f'layers.{i}.attention.wq.weight',
|
118 |
-
f'layers.{i}.attention.wk.weight',
|
119 |
-
f'layers.{i}.attention.wv.weight',
|
120 |
-
f'layers.{i}.attention.wo.weight',
|
121 |
-
f'layers.{i}.attention_norm.weight',
|
122 |
-
f'layers.{i}.feed_forward.w1.weight',
|
123 |
-
f'layers.{i}.feed_forward.w2.weight',
|
124 |
-
f'layers.{i}.feed_forward.w3.weight',
|
125 |
-
f'layers.{i}.ffn_norm.weight',
|
126 |
-
]
|
127 |
-
return ret
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
-
|
131 |
-
|
132 |
|
|
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
for n_mult in range(8192, 1, -1):
|
137 |
-
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
|
138 |
-
if calc_ff == n_ff:
|
139 |
-
return n_mult
|
140 |
-
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
|
141 |
-
|
142 |
-
@dataclass
|
143 |
-
class Params:
|
144 |
-
n_vocab: int
|
145 |
-
n_embd: int
|
146 |
-
n_mult: int
|
147 |
-
n_head: int
|
148 |
-
n_layer: int
|
149 |
-
n_kv_head: Optional[int] # This parameter is only used for Llama 2
|
150 |
|
151 |
@staticmethod
|
152 |
-
def guessed(model:
|
153 |
# try transformer naming first
|
154 |
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
155 |
|
@@ -165,65 +177,110 @@ class Params:
|
|
165 |
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
166 |
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
167 |
|
168 |
-
n_head=n_embd // 128 # guessed
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
return Params(
|
171 |
-
n_vocab
|
172 |
-
n_embd
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
|
|
177 |
)
|
178 |
|
179 |
@staticmethod
|
180 |
-
def loadHFTransformerJson(model:
|
181 |
config = json.load(open(config_path))
|
182 |
|
183 |
-
n_vocab
|
184 |
-
n_embd
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
return Params(
|
193 |
-
n_vocab
|
194 |
-
n_embd
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
199 |
)
|
200 |
|
201 |
# LLaMA v2 70B params.json
|
202 |
-
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
|
203 |
@staticmethod
|
204 |
-
def loadOriginalParamsJson(model:
|
205 |
config = json.load(open(config_path))
|
206 |
|
207 |
-
n_vocab
|
208 |
-
n_embd
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
if n_vocab == -1:
|
214 |
n_vocab = model["tok_embeddings.weight"].shape[0]
|
215 |
|
|
|
|
|
|
|
216 |
return Params(
|
217 |
-
n_vocab
|
218 |
-
n_embd
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
|
|
|
|
|
|
223 |
)
|
224 |
|
225 |
@staticmethod
|
226 |
-
def load(model_plus:
|
227 |
hf_config_path = model_plus.paths[0].parent / "config.json"
|
228 |
orig_config_path = model_plus.paths[0].parent / "params.json"
|
229 |
|
@@ -231,33 +288,104 @@ class Params:
|
|
231 |
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
|
232 |
elif orig_config_path.exists():
|
233 |
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
|
234 |
-
|
235 |
params = Params.guessed(model_plus.model)
|
|
|
|
|
|
|
|
|
236 |
|
237 |
-
print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
|
238 |
return params
|
239 |
|
240 |
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
else:
|
247 |
-
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
if fname_added_tokens is not None:
|
250 |
-
added_tokens = json.load(open(fname_added_tokens))
|
251 |
else:
|
252 |
added_tokens = {}
|
253 |
-
|
254 |
-
|
255 |
-
else:
|
256 |
-
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
257 |
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
258 |
-
actual_ids
|
259 |
if expected_ids != actual_ids:
|
260 |
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
|
|
|
261 |
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
262 |
self.added_tokens_list = [text for (text, idx) in items]
|
263 |
self.vocab_size_base: int = vocab_size
|
@@ -265,126 +393,74 @@ class SentencePieceVocab:
|
|
265 |
self.fname_tokenizer = fname_tokenizer
|
266 |
self.fname_added_tokens = fname_added_tokens
|
267 |
|
268 |
-
def sentencepiece_tokens(self) -> Iterable[
|
269 |
tokenizer = self.sentencepiece_tokenizer
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
294 |
-
score: float = tokenizer.get_score(i)
|
295 |
-
yield text, score
|
296 |
-
|
297 |
-
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
|
298 |
for text in self.added_tokens_list:
|
299 |
score = -1000.0
|
300 |
-
yield text.encode("utf-8"), score
|
301 |
|
302 |
-
def all_tokens(self) -> Iterable[
|
303 |
yield from self.sentencepiece_tokens()
|
304 |
yield from self.added_tokens()
|
305 |
|
306 |
def __repr__(self) -> str:
|
307 |
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
308 |
|
|
|
309 |
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
return f"<GGMLVocab with {self.vocab_size} tokens>"
|
320 |
-
|
321 |
-
|
322 |
-
Vocab = Union[SentencePieceVocab, GGMLVocab]
|
323 |
-
|
324 |
-
|
325 |
-
def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
|
326 |
-
if n_kv_head is not None and n_head != n_kv_head:
|
327 |
-
n_head //= n_kv_head
|
328 |
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
329 |
.swapaxes(1, 2)
|
330 |
.reshape(weights.shape))
|
331 |
|
332 |
|
333 |
-
def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
|
334 |
-
# First reinterpret each row from a list of int32s containing 8 values each
|
335 |
-
# to a list of uint8s containing 2 values each.
|
336 |
-
qvalues_pack8 = qvalues_pack32.view(np.uint8)
|
337 |
-
|
338 |
-
# Then split out the two values per int8 (which requires an actual
|
339 |
-
# conversion because numpy doesn't natively support int4s).
|
340 |
-
qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
|
341 |
-
qvalues[:, 0::2] = qvalues_pack8 & 0xf
|
342 |
-
qvalues[:, 1::2] = qvalues_pack8 >> 4
|
343 |
-
|
344 |
-
assert addends is None or addends.shape == scales.shape
|
345 |
-
assert qvalues.shape[0] == scales.shape[0]
|
346 |
-
assert qvalues.shape[1] % scales.shape[1] == 0
|
347 |
-
if g_idx is None:
|
348 |
-
repeat_count = qvalues.shape[1] // scales.shape[1]
|
349 |
-
scales = scales[:, :, np.newaxis]
|
350 |
-
if addends is not None:
|
351 |
-
addends = addends[:, :, np.newaxis]
|
352 |
-
# Reshape so that the below computation broadcasts over scales and addends:
|
353 |
-
qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
|
354 |
-
else:
|
355 |
-
# In this case the scale and addend is selected for each column by g_idx:
|
356 |
-
assert addends is not None
|
357 |
-
scales = scales[:, g_idx]
|
358 |
-
addends = addends[:, g_idx]
|
359 |
-
if addends is None:
|
360 |
-
# Q4_0
|
361 |
-
qvalues = qvalues.view(np.int8)
|
362 |
-
qvalues -= 8
|
363 |
-
# And do the actual 'value = scale * qvalue + addend' computation.
|
364 |
-
values = scales * qvalues
|
365 |
-
if addends is not None:
|
366 |
-
values += addends
|
367 |
-
if g_idx is None:
|
368 |
-
values.shape = (values.shape[0], values.shape[1] * values.shape[2])
|
369 |
-
return values
|
370 |
-
|
371 |
-
|
372 |
class Tensor(metaclass=ABCMeta):
|
373 |
data_type: DataType
|
374 |
|
375 |
@abstractmethod
|
376 |
-
def astype(self, data_type: DataType) ->
|
377 |
@abstractmethod
|
378 |
-
def permute(self, n_head: int,
|
379 |
@abstractmethod
|
380 |
-
def permute_part(self, n_part: int, n_head: int) ->
|
381 |
@abstractmethod
|
382 |
-
def part(self, n_part: int) ->
|
383 |
@abstractmethod
|
384 |
-
def to_ggml(self) ->
|
385 |
|
386 |
|
387 |
-
def bf16_to_fp32(bf16_arr: np.ndarray) ->
|
388 |
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
|
389 |
fp32_arr = bf16_arr.astype(np.uint32) << 16
|
390 |
return fp32_arr.view(np.float32)
|
@@ -397,27 +473,27 @@ class UnquantizedTensor(Tensor):
|
|
397 |
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
398 |
|
399 |
def astype(self, data_type: DataType) -> Tensor:
|
400 |
-
dtype =
|
401 |
if self.data_type == DT_BF16:
|
402 |
self.ndarray = bf16_to_fp32(self.ndarray)
|
403 |
return UnquantizedTensor(self.ndarray.astype(dtype))
|
404 |
|
405 |
-
def to_ggml(self) ->
|
406 |
return self
|
407 |
|
408 |
-
def permute_part(self, n_part: int, n_head: int) ->
|
409 |
r = self.ndarray.shape[0] // 3
|
410 |
-
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
|
411 |
|
412 |
-
def part(self, n_part: int) ->
|
413 |
r = self.ndarray.shape[0] // 3
|
414 |
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
|
415 |
|
416 |
-
def permute(self, n_head: int,
|
417 |
-
return UnquantizedTensor(permute(self.ndarray, n_head,
|
418 |
|
419 |
|
420 |
-
def load_unquantized(lazy_tensor:
|
421 |
tensor = lazy_tensor.load()
|
422 |
assert isinstance(tensor, UnquantizedTensor)
|
423 |
|
@@ -433,196 +509,24 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
|
|
433 |
return tensor.ndarray
|
434 |
|
435 |
|
436 |
-
|
437 |
-
data_type: QuantizedDataType
|
438 |
-
|
439 |
-
def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
|
440 |
-
rows, columns = shape
|
441 |
-
assert data_type in (DT_Q4_1, DT_Q4_0) # for now
|
442 |
-
assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this
|
443 |
-
assert columns % data_type.groupsize == 0
|
444 |
-
words_in_block = 6 if data_type == DT_Q4_1 else 5
|
445 |
-
self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
|
446 |
-
self.shape = shape[:]
|
447 |
-
self.data_type = data_type
|
448 |
-
|
449 |
-
def astype(self, data_type: DataType) -> Tensor:
|
450 |
-
if data_type == self.data_type:
|
451 |
-
return self
|
452 |
-
scales = self.ndarray[:, :, 0].view(np.float32)
|
453 |
-
if self.data_type.have_addends:
|
454 |
-
addends = self.ndarray[:, :, 1].view(np.float32)
|
455 |
-
else:
|
456 |
-
addends = None
|
457 |
-
qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
|
458 |
-
|
459 |
-
dq = dequantize_q4(qweights, scales, addends, g_idx=None)
|
460 |
-
return UnquantizedTensor(dq).astype(data_type)
|
461 |
-
|
462 |
-
def to_ggml(self) -> 'GGMLQuantizedTensor':
|
463 |
-
return self
|
464 |
-
|
465 |
-
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
|
466 |
-
return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
|
467 |
-
|
468 |
-
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
|
469 |
-
r = self.ndarray.shape[0] // 3
|
470 |
-
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
|
471 |
-
|
472 |
-
def part(self, n_part: int) -> 'UnquantizedTensor':
|
473 |
-
r = self.ndarray.shape[0] // 3
|
474 |
-
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
|
475 |
-
|
476 |
-
GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
|
477 |
-
|
478 |
-
|
479 |
-
class DeferredPermutedTensor(Tensor):
|
480 |
-
def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
|
481 |
-
self.base = base
|
482 |
-
self.n_head = n_head
|
483 |
-
self.n_kv_head = n_kv_head
|
484 |
-
self.data_type = self.base.data_type
|
485 |
-
|
486 |
-
def astype(self, data_type: DataType) -> Tensor:
|
487 |
-
return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
|
488 |
-
|
489 |
-
def to_ggml(self) -> GGMLCompatibleTensor:
|
490 |
-
return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
|
491 |
-
|
492 |
-
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
|
493 |
-
raise Exception("shouldn't permute twice")
|
494 |
-
|
495 |
-
|
496 |
-
class GPTQForLLaMaQuantizedTensor(Tensor):
|
497 |
-
def __init__(self, model: 'LazyModel', namebase: str) -> None:
|
498 |
-
qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
|
499 |
-
scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
|
500 |
-
|
501 |
-
bias = model.get(f"{namebase}.bias")
|
502 |
-
if bias is not None:
|
503 |
-
# Q4_1 does not support bias; good thing the bias is always all zeros.
|
504 |
-
assert not np.any(load_unquantized(bias))
|
505 |
-
|
506 |
-
if f"{namebase}.zeros" in model:
|
507 |
-
zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
|
508 |
-
else:
|
509 |
-
qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
|
510 |
-
assert qzeros.dtype == np.int32
|
511 |
-
zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
|
512 |
-
assert zeros.dtype == np.float32
|
513 |
-
|
514 |
-
assert zeros.shape == scales.shape
|
515 |
-
|
516 |
-
# Output is transposed compared to the input, and addends have their sign flipped.
|
517 |
-
# Scales and zeros similarly must be transposed but only for newer
|
518 |
-
# versions of GPTQ-for-LLaMa; the older versions can be identified by
|
519 |
-
# having shape (n_embd, 1).
|
520 |
-
qweight = qweight.T
|
521 |
-
if scales.shape[1] != 1:
|
522 |
-
scales = scales.T
|
523 |
-
zeros = zeros.T
|
524 |
-
|
525 |
-
# Output also has signs flipped for the addends.
|
526 |
-
self.qweight = qweight
|
527 |
-
self.scales = scales
|
528 |
-
self.addends = -zeros
|
529 |
-
|
530 |
-
self.g_idx: Optional[NDArray]
|
531 |
-
if f"{namebase}.g_idx" in model:
|
532 |
-
self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
|
533 |
-
assert self.g_idx.shape == (qweight.shape[1] * 8,)
|
534 |
-
else:
|
535 |
-
self.g_idx = None
|
536 |
-
|
537 |
-
self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
|
538 |
-
self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
|
539 |
-
have_g_idx=(self.g_idx is not None))
|
540 |
-
|
541 |
-
def inspect(self, row: int, col: int) -> None:
|
542 |
-
'''For debugging.'''
|
543 |
-
qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
|
544 |
-
if self.g_idx is not None:
|
545 |
-
group = self.g_idx[col]
|
546 |
-
else:
|
547 |
-
group = int(col // self.groupsize())
|
548 |
-
scale = self.scales[row, group]
|
549 |
-
addend = self.addends[row, group]
|
550 |
-
with np.printoptions(precision=None, suppress=True):
|
551 |
-
print(f'scale:{scale} addend:{addend} qweight:{qweight}')
|
552 |
-
print('possible values:', np.arange(16) * scale + addend)
|
553 |
-
print('actual value:', qweight * scale + addend)
|
554 |
-
|
555 |
-
def astype(self, data_type: DataType) -> Tensor:
|
556 |
-
if isinstance(data_type, QuantizedDataType):
|
557 |
-
assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
|
558 |
-
return self.regroup(data_type.groupsize)
|
559 |
-
|
560 |
-
dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
|
561 |
-
return UnquantizedTensor(dequantized).astype(data_type)
|
562 |
-
|
563 |
-
def groupsize(self) -> int:
|
564 |
-
assert self.addends.shape == self.scales.shape
|
565 |
-
assert self.shape[1] % self.scales.shape[1] == 0
|
566 |
-
return self.shape[1] // self.scales.shape[1]
|
567 |
-
|
568 |
-
def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
|
569 |
-
# Old versions of GPTQ-for-LLaMa shared scales and addends between all the
|
570 |
-
# columns in a row. Newer versions share them between every set of N
|
571 |
-
# columns in a row, where N is the `groupsize` parameter, usually 128. The
|
572 |
-
# output format shares them between every set of 32 columns. To handle
|
573 |
-
# this, duplicate scales and addends for every smaller group.
|
574 |
-
# (In the above, 'row' and 'column' are in the sense of the output.)
|
575 |
-
assert self.g_idx is None
|
576 |
-
old_groupsize = self.groupsize()
|
577 |
-
assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
|
578 |
-
ret = copy.copy(self)
|
579 |
-
ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
|
580 |
-
ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
|
581 |
-
ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
|
582 |
-
return ret
|
583 |
-
|
584 |
-
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
|
585 |
-
return DeferredPermutedTensor(self, n_head, n_kv_head)
|
586 |
-
|
587 |
-
def to_ggml(self) -> GGMLQuantizedTensor:
|
588 |
-
# The output format looks like this:
|
589 |
-
# For each row:
|
590 |
-
# For each group of 32 columns:
|
591 |
-
# - addend (float32, 4 bytes)
|
592 |
-
# - scale (float32, 4 bytes)
|
593 |
-
# - weights (int4 * 32, 16 bytes)
|
594 |
-
|
595 |
-
if self.groupsize() != 32:
|
596 |
-
raise Exception("should have been regrouped before converting to ggml")
|
597 |
-
|
598 |
-
# Since the output format is mixed between integers and floats, we have
|
599 |
-
# to hackily view the floats as int32s just so numpy will let us
|
600 |
-
# concatenate them.
|
601 |
-
addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
|
602 |
-
scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
|
603 |
-
|
604 |
-
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
|
605 |
-
grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
|
606 |
-
|
607 |
-
# And concatenate:
|
608 |
-
grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
|
609 |
-
|
610 |
-
return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
|
611 |
|
612 |
|
613 |
@dataclass
|
614 |
class LazyTensor:
|
615 |
_load: Callable[[], Tensor]
|
616 |
-
shape:
|
617 |
data_type: DataType
|
618 |
description: str
|
619 |
|
620 |
def load(self) -> Tensor:
|
621 |
ret = self._load()
|
622 |
-
|
|
|
|
|
623 |
return ret
|
624 |
|
625 |
-
def astype(self, data_type: DataType) ->
|
626 |
self.validate_conversion_to(data_type)
|
627 |
|
628 |
def load() -> Tensor:
|
@@ -630,39 +534,28 @@ class LazyTensor:
|
|
630 |
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
|
631 |
|
632 |
def validate_conversion_to(self, data_type: DataType) -> None:
|
633 |
-
if data_type
|
634 |
-
|
635 |
-
if isinstance(data_type, QuantizedDataType):
|
636 |
-
if not isinstance(self.data_type, QuantizedDataType):
|
637 |
-
raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
|
638 |
-
if self.data_type.have_g_idx:
|
639 |
-
sys.stderr.write(
|
640 |
-
"Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
|
641 |
-
"which is not yet natively supported by GGML. "
|
642 |
-
"For now you can still convert this model by passing `--outtype f16` to dequantize, "
|
643 |
-
"but that will result in a much larger output file for no quality benefit.\n")
|
644 |
-
sys.exit(1)
|
645 |
-
assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
|
646 |
|
647 |
|
648 |
-
LazyModel =
|
649 |
|
650 |
|
651 |
@dataclass
|
652 |
class ModelPlus:
|
653 |
model: LazyModel
|
654 |
-
paths:
|
655 |
-
format: Literal['ggml', 'torch', 'safetensors']
|
656 |
-
vocab:
|
657 |
|
658 |
|
659 |
-
def merge_sharded(models:
|
660 |
# Original LLaMA models have each file contain one part of each tensor.
|
661 |
# Use a dict instead of a set to preserve order.
|
662 |
names = {name: None for model in models for name in model}
|
663 |
|
664 |
def convert(name: str) -> LazyTensor:
|
665 |
-
lazy_tensors:
|
666 |
if len(lazy_tensors) == 1:
|
667 |
# only one file; don't go through this procedure since there might
|
668 |
# be quantized tensors
|
@@ -690,7 +583,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel:
|
|
690 |
return {name: convert(name) for name in names}
|
691 |
|
692 |
|
693 |
-
def merge_multifile_models(models_plus:
|
694 |
formats = set(mp.format for mp in models_plus)
|
695 |
assert len(formats) == 1, "different formats?"
|
696 |
format = formats.pop()
|
@@ -713,17 +606,17 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
|
|
713 |
return ModelPlus(model, paths, format, vocab)
|
714 |
|
715 |
|
716 |
-
def permute_lazy(lazy_tensor: LazyTensor, n_head: int,
|
717 |
def load() -> Tensor:
|
718 |
-
return lazy_tensor.load().permute(n_head,
|
719 |
-
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {
|
720 |
|
721 |
-
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
|
722 |
def load() -> Tensor:
|
723 |
-
return lazy_tensor.load().permute_part(n_part, n_head)
|
724 |
s = lazy_tensor.shape.copy()
|
725 |
s[0] = s[0] // 3
|
726 |
-
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
727 |
|
728 |
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
729 |
def load() -> Tensor:
|
@@ -732,66 +625,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
|
732 |
s[0] = s[0] // 3
|
733 |
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
|
734 |
|
735 |
-
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
|
736 |
-
out: LazyModel = {}
|
737 |
-
out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
|
738 |
-
out["norm.weight"] = model["model.norm.weight"]
|
739 |
-
out["output.weight"] = model["lm_head.weight"]
|
740 |
-
|
741 |
-
for i in itertools.count():
|
742 |
-
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
|
743 |
-
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
|
744 |
-
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
|
745 |
-
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
746 |
-
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
747 |
-
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
|
748 |
-
out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
|
749 |
-
out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
|
750 |
-
else:
|
751 |
-
break
|
752 |
-
|
753 |
-
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
|
754 |
-
|
755 |
-
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
|
756 |
-
out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
|
757 |
-
out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
|
758 |
-
|
759 |
-
out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
|
760 |
-
out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
|
761 |
-
return out
|
762 |
-
|
763 |
-
|
764 |
-
def handle_quantization(model: LazyModel) -> LazyModel:
|
765 |
-
'''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
|
766 |
-
(which resolve to UnquantizedTensors with the raw data) to one with entries
|
767 |
-
for 'foo.weight' (which resolve to QuantizedTensors).
|
768 |
-
'''
|
769 |
-
def convert(name: str) -> Tuple[str, LazyTensor]:
|
770 |
-
if name.endswith(".qweight"):
|
771 |
-
namebase = name.rsplit('.', 1)[0]
|
772 |
-
orig_name = namebase + ".weight"
|
773 |
-
|
774 |
-
lazy_tensor = model[name]
|
775 |
-
assert len(lazy_tensor.shape) == 2
|
776 |
-
real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
|
777 |
-
|
778 |
-
# Calculate type. This replicates the logic in
|
779 |
-
# GPTQForLLaMaQuantizedTensor (which is executed when the modelis
|
780 |
-
# actually loaded).
|
781 |
-
lazy_scales = model[f"{namebase}.scales"]
|
782 |
-
scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
|
783 |
-
assert real_shape[1] % scales_width == 0
|
784 |
-
groupsize = real_shape[1] // scales_width
|
785 |
-
have_g_idx = f"{namebase}.g_idx" in model
|
786 |
-
data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
|
787 |
-
|
788 |
-
def load() -> Tensor:
|
789 |
-
return GPTQForLLaMaQuantizedTensor(model, namebase)
|
790 |
-
|
791 |
-
return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
|
792 |
-
else:
|
793 |
-
return (name, model[name])
|
794 |
-
return dict(convert(name) for name in model)
|
795 |
|
796 |
# Functionality that simulates `torch.load` but where individual tensors are
|
797 |
# only loaded into memory on demand, not all at once.
|
@@ -824,13 +657,11 @@ class LazyUnpickler(pickle.Unpickler):
|
|
824 |
assert isinstance(pid[1], LazyStorageKind)
|
825 |
data_type = pid[1].data_type
|
826 |
filename_stem = pid[2]
|
827 |
-
filename = self.data_base_path
|
828 |
info = self.zip_file.getinfo(filename)
|
829 |
|
830 |
def load(offset: int, elm_count: int) -> NDArray:
|
831 |
-
dtype =
|
832 |
-
if dtype is None:
|
833 |
-
raise Exception("tensor stored in unsupported format")
|
834 |
fp = self.zip_file.open(info)
|
835 |
fp.seek(offset * dtype.itemsize)
|
836 |
size = elm_count * dtype.itemsize
|
@@ -840,9 +671,8 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
         return LazyStorage(load=load, kind=pid[1], description=description)

-
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)

@@ -852,13 +682,15 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'pickled storage_offset={storage_offset} in {storage.description}'
         return LazyTensor(load, list(size), storage.kind.data_type, description)

-
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)

-    CLASSES:
-
         ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
         ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
         ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
@@ -885,25 +717,17 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
     return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)


-SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
-    'BF16': DT_BF16,
-    'F16': DT_F16,
-    'F32': DT_F32,
-    'I32': DT_I32,
-}
-
-
 def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
     header_size, = struct.unpack('<Q', fp.read(8))
-    header:
     # Use mmap for the actual data to avoid race conditions with the file offset.
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
     byte_buf = mapped[8 + header_size:]

-    def convert(info:
         data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
-        numpy_dtype =
-        shape:
         begin, end = info['data_offsets']
         assert 0 <= begin <= end <= len(byte_buf)
         assert end - begin == math.prod(shape) * numpy_dtype.itemsize
@@ -924,84 +748,6 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
     return ret


-def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
-    magic = must_read(fp, 4)[::-1]
-    if magic in (b'ggmf', b'ggjt'):
-        version, = struct.unpack("i", must_read(fp, 4))
-        assert version == 1
-    else:
-        assert magic == b'ggml'
-        version = None
-    n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
-
-    tokens: List[Tuple[bytes, float]] = []
-    for i in range(n_vocab):
-        if i == 32000:
-            # HACK: GPT4All messed with the format without changing the magic
-            # number.  Specifically, they changed the vocab section to contain
-            # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
-            # extra pad token).  Try to detect if we're reading a file like
-            # this.
-            orig_pos = fp.tell()
-            fp.seek(20, io.SEEK_CUR)
-            is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
-            fp.seek(orig_pos)
-            if is_gpt4all:
-                break
-
-        length, = struct.unpack("i", must_read(fp, 4))
-        text = must_read(fp, length)
-        if magic != b'ggml':
-            score, = struct.unpack("f", must_read(fp, 4))
-            tokens.append((text, score))
-    vocab = GGMLVocab(tokens) if magic != b'ggml' else None
-
-    model: LazyModel = {}
-    # Use mmap for the actual data to avoid race conditions with the file offset.
-    off = fp.raw.tell()
-    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off)  # needed on Windows
-
-    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
-        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
-        assert 0 <= shape_len <= 3
-        shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
-        shape = shape[::-1]
-        name = must_read(fp, name_len).decode('utf-8')
-        data_type = FTYPE_TO_DATA_TYPE[ftype]
-
-        if magic == b'ggjt':
-            fp.seek((fp.tell() + 31) & -32)
-
-        if data_type == DT_Q4_1:
-            # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
-            size = 24 * (shape[1] // 32) * shape[0]
-        elif data_type == DT_Q4_0:
-            size = 20 * (shape[1] // 32) * shape[0]
-        else:
-            numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
-            elm_count = math.prod(shape)
-            size = elm_count * numpy_dtype.itemsize
-        offset = fp.tell()
-        buf = mapped[offset:offset+size]
-        fp.seek(size, io.SEEK_CUR)
-
-        def load() -> Tensor:
-            if isinstance(data_type, QuantizedDataType):
-                ndarray = np.frombuffer(buf, dtype=np.uint32)
-                return GGMLQuantizedTensor(ndarray, shape, data_type)
-            else:
-                return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
-        description = f'ggml offset={offset} type={data_type} path={path}'
-        model[name] = LazyTensor(load, shape, data_type, description)
-
-    while fp.read(1) != b'':
-        fp.seek(-1, io.SEEK_CUR)
-        read_tensor()
-
-    return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
-
-
 @functools.lru_cache(maxsize=None)
 def lazy_load_file(path: Path) -> ModelPlus:
     fp = open(path, 'rb')
@@ -1010,9 +756,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
     if first8[:2] == b'PK':
         # A zip file, i.e. PyTorch format
         return lazy_load_torch_file(fp, path)
-    elif first8[2:4] == b'gg':
-        # GGML format
-        return lazy_load_ggml_file(fp, path)
    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
         # Probably safetensors
         return lazy_load_safetensors_file(fp, path)
@@ -1023,28 +766,43 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

-def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
     '''Parallel map, but with backpressure.  If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
     letting results pile up in memory.  Specifically, there is a max of one
     output value buffered per thread.'''
     while futures:
         result = futures.pop(0).result()
         yield result

 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
@@ -1061,105 +819,200 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:

 class OutputFile:
     def __init__(self, fname_out: Path) -> None:
-        self.
-    def
-        params.
-        self.
-        self.
-        self.
-        self.

     @staticmethod
-    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        of.

     @staticmethod
-    def
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params, file_type)
-        print("Writing vocab...")
-        of.write_vocab(vocab)

         for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-            of.
-            ndarray.tofile(of.fout)
-        of.fout.close()

-def pick_output_type(model: LazyModel, output_type_str:
-    wq_type = model[
         return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type
         return GGMLFileType.MostlyF16
-    if output_type_str == "
-            return GGMLFileType.MostlyQ4_1
-        else:
-            return GGMLFileType.PerLayerIsQ4_1
-    if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
-        return GGMLFileType.MostlyQ4_0
     name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
     raise Exception(f"Unexpected combination of types: {name_to_type}")

-def
-    model = convert_transformers_to_orig(model, params)
-    model = filter_and_sort_tensors(model)

     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
     '''
     # Support the following patterns:
-    patterns:
     # - x.00.pth, x.01.pth, etc.
     (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
     # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1175,11 +1028,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
     return None


-def find_multifile_paths(path: Path) ->
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the whole list of paths in the model.
     '''
-    ret:
     for i in itertools.count():
         nth_path = nth_multifile_path(path, i)
         if nth_path is None:
@@ -1203,11 +1056,6 @@ def load_some_model(path: Path) -> ModelPlus:
     # Try the PyTorch patterns too, with lower priority
     globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
     files = [file for glob in globs for file in path.glob(glob)]
-    if not files:
-        # Try GGML too, but with lower priority, since if both a non-GGML
-        # model and a GGML model exist in the same directory, we assume the
-        # latter was converted from the former.
-        files = list(path.glob("ggml-model*.bin*"))
     if not files:
         raise Exception(f"Can't find model in directory {path}")
     if len(files) > 1:
@@ -1215,7 +1063,7 @@ def load_some_model(path: Path) -> ModelPlus:
     path = files[0]

     paths = find_multifile_paths(path)
-    models_plus:
     for path in paths:
         print(f"Loading model file {path}")
         models_plus.append(lazy_load_file(path))
@@ -1224,19 +1072,14 @@ def load_some_model(path: Path) -> ModelPlus:
     return model_plus


-def
-    return {name: model[name] for name in TENSORS_LIST if name in model}
-
-
-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
-    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
         vocab_file = "tokenizer.model"
         if vocabtype == 'bpe':
-
         path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
         path3 = path.parent / vocab_file
@@ -1246,23 +1089,27 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
         path = path3
     else:
         raise FileNotFoundError(
-            f"Could not find
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
-
-

-def default_outfile(model_paths:
     namestr = {
-        GGMLFileType.AllF32:
         GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.
-        GGMLFileType.MostlyQ4_1: "q4_1",
-        GGMLFileType.PerLayerIsQ4_1: "q4_1",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.
     if ret in model_paths:
         sys.stderr.write(
             f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1279,47 +1126,82 @@ def do_dump_model(model_plus: ModelPlus) -> None:
     print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")


-def main(args_in:
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--dump",
-    parser.add_argument("--dump-single", action="store_true",
-    parser.add_argument("--vocab-only",
-    parser.add_argument("--outtype",
-    parser.add_argument("--vocab-dir",
-    parser.add_argument("--outfile",
-    parser.add_argument("model",
-    parser.add_argument("--
     args = parser.parse_args(args_in)

-    vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, vocab)
         print(f"Wrote {outfile}")
     else:
-        print(f"Wrote {outfile}")


 if __name__ == '__main__':
1 |
+
#!/usr/bin/env python3
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
import argparse
|
5 |
import concurrent.futures
|
6 |
import copy
|
|
|
17 |
import signal
|
18 |
import struct
|
19 |
import sys
|
20 |
+
import time
|
21 |
import zipfile
|
22 |
from abc import ABCMeta, abstractmethod
|
23 |
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
24 |
from dataclasses import dataclass
|
25 |
from pathlib import Path
|
26 |
+
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
|
|
|
27 |
|
28 |
import numpy as np
|
29 |
+
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
30 |
+
|
31 |
+
import os
|
32 |
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
33 |
+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
34 |
+
import gguf
|
35 |
|
36 |
if TYPE_CHECKING:
|
37 |
+
from typing import TypeAlias
|
38 |
|
39 |
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
40 |
faulthandler.register(signal.SIGUSR1)
|
41 |
|
42 |
+
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
43 |
|
44 |
+
ARCH=gguf.MODEL_ARCH.LLAMA
|
45 |
+
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
46 |
+
|
47 |
+
DEFAULT_CONCURRENCY = 8
|
48 |
+
#
|
49 |
+
# data types
|
50 |
+
#
|
51 |
|
52 |
@dataclass(frozen=True)
|
53 |
+
class DataType:
|
54 |
name: str
|
55 |
+
dtype: np.dtype[Any]
|
56 |
+
valid_conversions: list[str]
|
57 |
|
58 |
+
def elements_to_bytes(self, n_elements: int) -> int:
|
59 |
+
return n_elements * self.dtype.itemsize
|
|
|
|
|
|
|
|
|
60 |
|
61 |
@dataclass(frozen=True)
|
62 |
+
class UnquantizedDataType(DataType):
|
63 |
+
pass
|
|
|
|
|
|
|
64 |
|
65 |
+
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
66 |
+
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
67 |
+
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
68 |
+
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
69 |
|
70 |
+
@dataclass(frozen=True)
|
71 |
+
class QuantizedDataType(DataType):
|
72 |
+
block_size: int
|
73 |
+
quantized_dtype: np.dtype[Any]
|
74 |
+
ggml_type: gguf.GGMLQuantizationType
|
75 |
|
76 |
+
def quantize(self, arr: NDArray) -> NDArray:
|
77 |
+
raise NotImplementedError(f'Quantization for {self.name} not implemented')
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
def elements_to_bytes(self, n_elements: int) -> int:
|
80 |
+
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
81 |
+
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
82 |
|
83 |
+
@dataclass(frozen=True)
|
84 |
+
class Q8_0QuantizedDataType(QuantizedDataType):
|
85 |
+
# Mini Q8_0 quantization in Python!
|
86 |
+
def quantize(self, arr: NDArray) -> NDArray:
|
87 |
+
assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
|
88 |
+
assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
|
89 |
+
n_blocks = arr.size // self.block_size
|
90 |
+
blocks = arr.reshape((n_blocks, self.block_size))
|
91 |
+
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
92 |
+
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
93 |
+
d = abs(blocks).max(axis = 1) / np.float32(127)
|
94 |
+
with np.errstate(divide = 'ignore'):
|
95 |
+
qs = (blocks / d[:, None]).round()
|
96 |
+
qs[d == 0] = 0
|
97 |
+
yield from zip(d, qs)
|
98 |
+
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
99 |
+
|
100 |
+
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
101 |
+
dtype = np.dtype(np.float32), valid_conversions = [],
|
102 |
+
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
103 |
+
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
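The Q8_0 layout above can be sanity-checked with a short round-trip. This is an illustrative sketch only: the dequantize_q8_0 helper is hypothetical (not part of convert.py) and it assumes the DT_Q8_0 definition above is in scope.

    import numpy as np

    def dequantize_q8_0(blocks: np.ndarray) -> np.ndarray:
        # blocks has dtype [('d', '<f2'), ('qs', 'i1', (32,))]; dequantization is d * qs per block
        return blocks['d'].astype(np.float32)[:, None] * blocks['qs'].astype(np.float32)

    arr = np.random.rand(64).astype(np.float32)   # two 32-element blocks
    q = DT_Q8_0.quantize(arr)                     # structured array, one row per block
    approx = dequantize_q8_0(q).reshape(-1)
    # reconstruction error is bounded by the per-block quantization step
    assert np.max(np.abs(approx - arr)) <= arr.max() / 127 + 1e-6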
|
104 |
+
|
105 |
+
# Quantized types skipped here because they may also map to np.float32
|
106 |
+
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
107 |
+
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
|
108 |
+
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
|
109 |
+
raise ValueError(f'Invalid duplicate data type {dt}')
|
110 |
+
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
|
111 |
+
|
112 |
+
SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
|
113 |
+
'BF16': DT_BF16,
|
114 |
+
'F16': DT_F16,
|
115 |
+
'F32': DT_F32,
|
116 |
+
'I32': DT_I32,
|
117 |
}
|
118 |
|
119 |
+
# TODO: match this with `llama_ftype`
|
120 |
+
# TODO: rename to LLAMAFileType
|
121 |
+
# TODO: move to `gguf.py`
|
122 |
+
class GGMLFileType(enum.IntEnum):
|
123 |
+
AllF32 = 0
|
124 |
+
MostlyF16 = 1 # except 1d tensors
|
125 |
+
MostlyQ8_0 = 7 # except 1d tensors
|
126 |
+
|
127 |
+
def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
|
128 |
+
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
129 |
+
if dt is None:
|
|
130 |
raise ValueError(self)
|
131 |
+
# 1D tensors are always F32.
|
132 |
+
return dt if len(tensor.shape) > 1 else DT_F32
|
133 |
|
134 |
+
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
135 |
+
GGMLFileType.AllF32 : DT_F32,
|
136 |
+
GGMLFileType.MostlyF16 : DT_F16,
|
137 |
+
GGMLFileType.MostlyQ8_0: DT_Q8_0,
|
138 |
+
}
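As a quick illustration of the rule in type_for_tensor above (a sketch only; _FakeTensor is a hypothetical stand-in that just mimics the .shape attribute the method inspects):

    from dataclasses import dataclass

    @dataclass
    class _FakeTensor:
        shape: list

    ft = GGMLFileType.MostlyF16
    assert ft.type_for_tensor("blk.0.attn_q.weight", _FakeTensor([4096, 4096])) is DT_F16  # 2D follows the file type
    assert ft.type_for_tensor("blk.0.attn_norm.weight", _FakeTensor([4096])) is DT_F32     # 1D always stays F32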
|
139 |
|
140 |
+
#
|
141 |
+
# hparams loading
|
142 |
+
#
|
143 |
|
144 |
+
@dataclass
|
145 |
+
class Params:
|
146 |
+
n_vocab: int
|
147 |
+
n_embd: int
|
148 |
+
n_layer: int
|
149 |
+
n_ctx: int
|
150 |
+
n_ff: int
|
151 |
+
n_head: int
|
152 |
+
n_head_kv: int
|
153 |
+
f_norm_eps: float
|
154 |
|
155 |
+
f_rope_freq_base: float | None = None
|
156 |
+
f_rope_scale: float | None = None
|
157 |
|
158 |
+
ftype: GGMLFileType | None = None
|
159 |
|
160 |
+
# path to the directory containing the model files
|
161 |
+
path_model: Path | None = None
|
162 |
|
163 |
@staticmethod
|
164 |
+
def guessed(model: LazyModel) -> Params:
|
165 |
# try transformer naming first
|
166 |
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
167 |
|
|
|
177 |
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
178 |
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
179 |
|
180 |
+
n_head = n_embd // 128 # guessed
|
181 |
+
n_mult = 256 # guessed
|
182 |
+
|
183 |
+
# TODO: verify this
|
184 |
+
n_ff = int(2 * (4 * n_embd) / 3)
|
185 |
+
n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
|
186 |
|
187 |
return Params(
|
188 |
+
n_vocab = n_vocab,
|
189 |
+
n_embd = n_embd,
|
190 |
+
n_layer = n_layer,
|
191 |
+
n_ctx = -1,
|
192 |
+
n_ff = n_ff,
|
193 |
+
n_head = n_head,
|
194 |
+
n_head_kv = n_head,
|
195 |
+
f_norm_eps = 1e-5,
|
196 |
)
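For concreteness, a worked instance of the n_ff guess above (shown as a comment, using the 7B configuration):

    # n_embd = 4096  ->  int(2 * (4 * 4096) / 3) = 10922
    # rounded up to a multiple of n_mult = 256: 256 * ((10922 + 255) // 256) = 11008,
    # which matches the feed-forward width of LLaMA-7B.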
|
197 |
|
198 |
@staticmethod
|
199 |
+
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
200 |
config = json.load(open(config_path))
|
201 |
|
202 |
+
n_vocab = config["vocab_size"]
|
203 |
+
n_embd = config["hidden_size"]
|
204 |
+
n_layer = config["num_hidden_layers"]
|
205 |
+
n_ff = config["intermediate_size"]
|
206 |
+
n_head = config["num_attention_heads"]
|
207 |
+
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
208 |
+
f_norm_eps = config["rms_norm_eps"]
|
209 |
+
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
210 |
+
|
211 |
+
rope_scaling = config.get("rope_scaling")
|
212 |
+
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
|
213 |
+
f_rope_scale = config["rope_scaling"].get("factor")
|
214 |
+
else:
|
215 |
+
f_rope_scale = None
|
216 |
|
217 |
+
if "max_sequence_length" in config:
|
218 |
+
n_ctx = config["max_sequence_length"]
|
219 |
+
elif "max_position_embeddings" in config:
|
220 |
+
n_ctx = config["max_position_embeddings"]
|
221 |
+
else:
|
222 |
+
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
223 |
+
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
224 |
|
225 |
return Params(
|
226 |
+
n_vocab = n_vocab,
|
227 |
+
n_embd = n_embd,
|
228 |
+
n_layer = n_layer,
|
229 |
+
n_ctx = n_ctx,
|
230 |
+
n_ff = n_ff,
|
231 |
+
n_head = n_head,
|
232 |
+
n_head_kv = n_head_kv,
|
233 |
+
f_norm_eps = f_norm_eps,
|
234 |
+
f_rope_freq_base = f_rope_freq_base,
|
235 |
+
f_rope_scale = f_rope_scale,
|
236 |
)
|
237 |
|
238 |
# LLaMA v2 70B params.json
|
239 |
+
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
240 |
@staticmethod
|
241 |
+
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
242 |
config = json.load(open(config_path))
|
243 |
|
244 |
+
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
|
245 |
+
n_embd = config["dim"]
|
246 |
+
n_layer = config["n_layers"]
|
247 |
+
n_ff = -1
|
248 |
+
n_head = config["n_heads"]
|
249 |
+
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
|
250 |
+
f_norm_eps = config["norm_eps"]
|
251 |
+
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
252 |
+
|
253 |
+
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
254 |
+
if f_rope_freq_base == 1000000:
|
255 |
+
# CodeLlama
|
256 |
+
n_ctx = 16384
|
257 |
+
elif config["norm_eps"] == 1e-05:
|
258 |
+
# LLaMA v2
|
259 |
+
n_ctx = 4096
|
260 |
+
else:
|
261 |
+
# LLaMA v1
|
262 |
+
n_ctx = 2048
|
263 |
|
264 |
if n_vocab == -1:
|
265 |
n_vocab = model["tok_embeddings.weight"].shape[0]
|
266 |
|
267 |
+
if n_ff == -1:
|
268 |
+
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
|
269 |
+
|
270 |
return Params(
|
271 |
+
n_vocab = n_vocab,
|
272 |
+
n_embd = n_embd,
|
273 |
+
n_layer = n_layer,
|
274 |
+
n_ctx = n_ctx,
|
275 |
+
n_ff = n_ff,
|
276 |
+
n_head = n_head,
|
277 |
+
n_head_kv = n_head_kv,
|
278 |
+
f_norm_eps = f_norm_eps,
|
279 |
+
f_rope_freq_base = f_rope_freq_base,
|
280 |
)
|
281 |
|
282 |
@staticmethod
|
283 |
+
def load(model_plus: ModelPlus) -> Params:
|
284 |
hf_config_path = model_plus.paths[0].parent / "config.json"
|
285 |
orig_config_path = model_plus.paths[0].parent / "params.json"
|
286 |
|
|
|
288 |
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
|
289 |
elif orig_config_path.exists():
|
290 |
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
|
291 |
+
elif model_plus.format != 'none':
|
292 |
params = Params.guessed(model_plus.model)
|
293 |
+
else:
|
294 |
+
raise ValueError('Cannot guess params when model format is none')
|
295 |
+
|
296 |
+
params.path_model = model_plus.paths[0].parent
|
297 |
|
|
|
298 |
return params
|
299 |
|
300 |
|
301 |
+
#
|
302 |
+
# vocab
|
303 |
+
#
|
304 |
+
|
305 |
+
class BpeVocab:
|
306 |
+
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
307 |
+
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
308 |
+
added_tokens: dict[str, int]
|
309 |
+
if fname_added_tokens is not None:
|
310 |
+
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
311 |
+
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
312 |
else:
|
313 |
+
# Fall back to trying to find the added tokens in tokenizer.json
|
314 |
+
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
315 |
+
if not tokenizer_json_file.is_file():
|
316 |
+
added_tokens = {}
|
317 |
+
else:
|
318 |
+
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
319 |
+
added_tokens = dict(
|
320 |
+
(item['content'], item['id'])
|
321 |
+
for item in tokenizer_json.get('added_tokens', [])
|
322 |
+
# Added tokens here can be duplicates of the main vocabulary.
|
323 |
+
if item['content'] not in self.bpe_tokenizer )
|
324 |
+
|
325 |
+
vocab_size: int = len(self.bpe_tokenizer)
|
326 |
+
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
327 |
+
actual_ids = sorted(added_tokens.values())
|
328 |
+
if expected_ids != actual_ids:
|
329 |
+
expected_end_id = vocab_size + len(actual_ids) - 1
|
330 |
+
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
331 |
+
|
332 |
+
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
333 |
+
self.added_tokens_list = [text for (text, idx) in items]
|
334 |
+
self.vocab_size_base: int = vocab_size
|
335 |
+
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
336 |
+
self.fname_tokenizer = fname_tokenizer
|
337 |
+
self.fname_added_tokens = fname_added_tokens
|
338 |
+
|
339 |
+
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
340 |
+
tokenizer = self.bpe_tokenizer
|
341 |
+
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
342 |
+
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
343 |
+
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
344 |
+
score = 0.0
|
345 |
+
for i, item in enumerate(tokenizer):
|
346 |
+
text: bytes = item.encode("utf-8")
|
347 |
+
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
348 |
+
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
349 |
+
if i == 0 and text == b'<unk>':
|
350 |
+
toktype = gguf.TokenType.UNKNOWN
|
351 |
+
elif i == 1 or i == 2:
|
352 |
+
toktype = gguf.TokenType.CONTROL
|
353 |
+
elif i >= 3 and text.startswith(b'<0x'):
|
354 |
+
toktype = gguf.TokenType.BYTE
|
355 |
+
else:
|
356 |
+
toktype = gguf.TokenType.NORMAL
|
357 |
+
else:
|
358 |
+
toktype = gguf.TokenType.NORMAL
|
359 |
+
yield text, score, toktype
|
360 |
+
|
361 |
+
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
362 |
+
for text in self.added_tokens_list:
|
363 |
+
score = -1000.0
|
364 |
+
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
365 |
+
|
366 |
+
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
367 |
+
yield from self.bpe_tokens()
|
368 |
+
yield from self.added_tokens()
|
369 |
+
|
370 |
+
def __repr__(self) -> str:
|
371 |
+
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
372 |
+
|
373 |
+
|
374 |
+
class SentencePieceVocab:
|
375 |
+
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
376 |
+
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
377 |
+
added_tokens: dict[str, int]
|
378 |
if fname_added_tokens is not None:
|
379 |
+
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
380 |
else:
|
381 |
added_tokens = {}
|
382 |
+
|
383 |
+
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
|
|
|
|
384 |
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
385 |
+
actual_ids = sorted(added_tokens.values())
|
386 |
if expected_ids != actual_ids:
|
387 |
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
|
388 |
+
|
389 |
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
390 |
self.added_tokens_list = [text for (text, idx) in items]
|
391 |
self.vocab_size_base: int = vocab_size
|
|
|
393 |
self.fname_tokenizer = fname_tokenizer
|
394 |
self.fname_added_tokens = fname_added_tokens
|
395 |
|
396 |
+
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
397 |
tokenizer = self.sentencepiece_tokenizer
|
398 |
+
for i in range(tokenizer.vocab_size()):
|
399 |
+
piece = tokenizer.id_to_piece(i)
|
400 |
+
text: bytes = piece.encode("utf-8")
|
401 |
+
score: float = tokenizer.get_score(i)
|
402 |
+
|
403 |
+
toktype = gguf.TokenType.NORMAL
|
404 |
+
if tokenizer.is_unknown(i):
|
405 |
+
toktype = gguf.TokenType.UNKNOWN
|
406 |
+
if tokenizer.is_control(i):
|
407 |
+
toktype = gguf.TokenType.CONTROL
|
408 |
+
|
409 |
+
# NOTE: I think added_tokens are user defined.
|
410 |
+
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
411 |
+
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
|
412 |
+
|
413 |
+
if tokenizer.is_unused(i):
|
414 |
+
toktype = gguf.TokenType.UNUSED
|
415 |
+
if tokenizer.is_byte(i):
|
416 |
+
toktype = gguf.TokenType.BYTE
|
417 |
+
|
418 |
+
yield text, score, toktype
|
419 |
+
|
420 |
+
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
|
|
|
|
|
|
|
|
|
421 |
for text in self.added_tokens_list:
|
422 |
score = -1000.0
|
423 |
+
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
424 |
|
425 |
+
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
426 |
yield from self.sentencepiece_tokens()
|
427 |
yield from self.added_tokens()
|
428 |
|
429 |
def __repr__(self) -> str:
|
430 |
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
431 |
|
432 |
+
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
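Both vocab classes above enforce the same invariant on added tokens; a minimal illustration with hypothetical token names:

    vocab_size = 32000                               # size of the base vocabulary
    added = {"<pad>": 32000, "<extra_0>": 32001}     # added tokens must extend it contiguously
    expected = list(range(vocab_size, vocab_size + len(added)))
    assert sorted(added.values()) == expected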
|
433 |
|
434 |
+
#
|
435 |
+
# data loading
|
436 |
+
# TODO: reuse (probably move to gguf.py?)
|
437 |
+
#
|
438 |
|
439 |
+
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
440 |
+
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
441 |
+
if n_head_kv is not None and n_head != n_head_kv:
|
442 |
+
n_head //= n_head_kv
|
|
443 |
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
444 |
.swapaxes(1, 2)
|
445 |
.reshape(weights.shape))
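A small sanity check of permute() above (illustrative only, assuming the function as defined): the call reorders rows within each head block so Q/K weights match GGML's rotary layout, and it never changes the set of rows.

    import numpy as np

    n_head, head_dim = 2, 4
    w = np.arange(n_head * head_dim * 3, dtype=np.float32).reshape(n_head * head_dim, 3)
    p = permute(w, n_head, n_head)
    assert p.shape == w.shape
    # every original row is still present exactly once, just in a different order
    assert sorted(map(tuple, p)) == sorted(map(tuple, w))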
|
446 |
|
447 |
|
448 |
class Tensor(metaclass=ABCMeta):
|
449 |
data_type: DataType
|
450 |
|
451 |
@abstractmethod
|
452 |
+
def astype(self, data_type: DataType) -> Tensor: ...
|
453 |
@abstractmethod
|
454 |
+
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
|
455 |
@abstractmethod
|
456 |
+
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
|
457 |
@abstractmethod
|
458 |
+
def part(self, n_part: int) -> UnquantizedTensor: ...
|
459 |
@abstractmethod
|
460 |
+
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
461 |
|
462 |
|
463 |
+
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
|
464 |
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
|
465 |
fp32_arr = bf16_arr.astype(np.uint32) << 16
|
466 |
return fp32_arr.view(np.float32)
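The shift above works because bfloat16 is simply the top 16 bits of an IEEE float32. A tiny check (illustrative; the value is chosen to be exactly representable in bfloat16, and np is the numpy import already at the top of the file):

    x = np.array([3.140625], dtype=np.float32)           # fits in bfloat16's 8-bit significand
    bf16 = (x.view(np.uint32) >> 16).astype(np.uint16)   # keep only the upper 16 bits
    assert bf16_to_fp32(bf16)[0] == x[0]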
|
|
|
473 |
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
474 |
|
475 |
def astype(self, data_type: DataType) -> Tensor:
|
476 |
+
dtype = data_type.dtype
|
477 |
if self.data_type == DT_BF16:
|
478 |
self.ndarray = bf16_to_fp32(self.ndarray)
|
479 |
return UnquantizedTensor(self.ndarray.astype(dtype))
|
480 |
|
481 |
+
def to_ggml(self) -> UnquantizedTensor:
|
482 |
return self
|
483 |
|
484 |
+
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
485 |
r = self.ndarray.shape[0] // 3
|
486 |
+
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
487 |
|
488 |
+
def part(self, n_part: int) -> UnquantizedTensor:
|
489 |
r = self.ndarray.shape[0] // 3
|
490 |
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
|
491 |
|
492 |
+
def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
493 |
+
return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
|
494 |
|
495 |
|
496 |
+
def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
|
497 |
tensor = lazy_tensor.load()
|
498 |
assert isinstance(tensor, UnquantizedTensor)
|
499 |
|
|
|
509 |
return tensor.ndarray
|
510 |
|
511 |
|
512 |
+
GGMLCompatibleTensor = UnquantizedTensor
|
513 |
|
514 |
|
515 |
@dataclass
|
516 |
class LazyTensor:
|
517 |
_load: Callable[[], Tensor]
|
518 |
+
shape: list[int]
|
519 |
data_type: DataType
|
520 |
description: str
|
521 |
|
522 |
def load(self) -> Tensor:
|
523 |
ret = self._load()
|
524 |
+
# Should be okay if it maps to the same numpy type?
|
525 |
+
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
526 |
+
(self.data_type, ret.data_type, self.description)
|
527 |
return ret
|
528 |
|
529 |
+
def astype(self, data_type: DataType) -> LazyTensor:
|
530 |
self.validate_conversion_to(data_type)
|
531 |
|
532 |
def load() -> Tensor:
|
|
|
534 |
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
|
535 |
|
536 |
def validate_conversion_to(self, data_type: DataType) -> None:
|
537 |
+
if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
|
538 |
+
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
|
539 |
|
540 |
|
541 |
+
LazyModel: TypeAlias = 'dict[str, LazyTensor]'
|
542 |
|
543 |
|
544 |
@dataclass
|
545 |
class ModelPlus:
|
546 |
model: LazyModel
|
547 |
+
paths: list[Path] # Where this was read from.
|
548 |
+
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
549 |
+
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
|
550 |
|
551 |
|
552 |
+
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
553 |
# Original LLaMA models have each file contain one part of each tensor.
|
554 |
# Use a dict instead of a set to preserve order.
|
555 |
names = {name: None for model in models for name in model}
|
556 |
|
557 |
def convert(name: str) -> LazyTensor:
|
558 |
+
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
|
559 |
if len(lazy_tensors) == 1:
|
560 |
# only one file; don't go through this procedure since there might
|
561 |
# be quantized tensors
|
|
|
583 |
return {name: convert(name) for name in names}
|
584 |
|
585 |
|
586 |
+
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
587 |
formats = set(mp.format for mp in models_plus)
|
588 |
assert len(formats) == 1, "different formats?"
|
589 |
format = formats.pop()
|
|
|
606 |
return ModelPlus(model, paths, format, vocab)
|
607 |
|
608 |
|
609 |
+
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
|
610 |
def load() -> Tensor:
|
611 |
+
return lazy_tensor.load().permute(n_head, n_head_kv)
|
612 |
+
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
613 |
|
614 |
+
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
615 |
def load() -> Tensor:
|
616 |
+
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
617 |
s = lazy_tensor.shape.copy()
|
618 |
s[0] = s[0] // 3
|
619 |
+
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
620 |
|
621 |
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
622 |
def load() -> Tensor:
|
|
|
625 |
s[0] = s[0] // 3
|
626 |
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
|
627 |
|
628 |
|
629 |
# Functionality that simulates `torch.load` but where individual tensors are
|
630 |
# only loaded into memory on demand, not all at once.
|
|
|
657 |
assert isinstance(pid[1], LazyStorageKind)
|
658 |
data_type = pid[1].data_type
|
659 |
filename_stem = pid[2]
|
660 |
+
filename = f'{self.data_base_path}/{filename_stem}'
|
661 |
info = self.zip_file.getinfo(filename)
|
662 |
|
663 |
def load(offset: int, elm_count: int) -> NDArray:
|
664 |
+
dtype = data_type.dtype
|
|
|
|
|
665 |
fp = self.zip_file.open(info)
|
666 |
fp.seek(offset * dtype.itemsize)
|
667 |
size = elm_count * dtype.itemsize
|
|
|
671 |
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
672 |
return LazyStorage(load=load, kind=pid[1], description=description)
|
673 |
|
674 |
+
@staticmethod
|
675 |
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
|
|
|
676 |
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
|
677 |
assert isinstance(storage, LazyStorage)
|
678 |
|
|
|
682 |
description = f'pickled storage_offset={storage_offset} in {storage.description}'
|
683 |
return LazyTensor(load, list(size), storage.kind.data_type, description)
|
684 |
|
685 |
+
@staticmethod
|
686 |
def rebuild_from_type_v2(func, new_type, args, state):
|
687 |
return func(*args)
|
688 |
|
689 |
+
CLASSES: dict[tuple[str, str], Any] = {
|
690 |
+
# getattr used here as a workaround for mypy not being smart enough to determine
|
691 |
+
# the staticmethods have a __func__ attribute.
|
692 |
+
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
693 |
+
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
694 |
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
|
695 |
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
|
696 |
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
|
|
|
717 |
return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
|
718 |
|
719 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
720 |
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
721 |
header_size, = struct.unpack('<Q', fp.read(8))
|
722 |
+
header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
|
723 |
# Use mmap for the actual data to avoid race conditions with the file offset.
|
724 |
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
|
725 |
byte_buf = mapped[8 + header_size:]
|
726 |
|
727 |
+
def convert(info: dict[str, Any]) -> LazyTensor:
|
728 |
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
729 |
+
numpy_dtype = data_type.dtype
|
730 |
+
shape: list[int] = info['shape']
|
731 |
begin, end = info['data_offsets']
|
732 |
assert 0 <= begin <= end <= len(byte_buf)
|
733 |
assert end - begin == math.prod(shape) * numpy_dtype.itemsize
|
|
|
748 |
return ret
|
749 |
|
750 |
|
751 |
@functools.lru_cache(maxsize=None)
|
752 |
def lazy_load_file(path: Path) -> ModelPlus:
|
753 |
fp = open(path, 'rb')
|
|
|
756 |
if first8[:2] == b'PK':
|
757 |
# A zip file, i.e. PyTorch format
|
758 |
return lazy_load_torch_file(fp, path)
|
|
|
|
|
|
|
759 |
elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
|
760 |
# Probably safetensors
|
761 |
return lazy_load_safetensors_file(fp, path)
|
|
|
766 |
In = TypeVar('In')
|
767 |
Out = TypeVar('Out')
|
768 |
|
769 |
+
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
|
|
770 |
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
771 |
fast enough, this will stop calling `func` at some point rather than
|
772 |
letting results pile up in memory. Specifically, there is a max of one
|
773 |
output value buffered per thread.'''
|
774 |
+
if concurrency < 2:
|
775 |
+
yield from map(func, iterable)
|
776 |
+
# Not reached.
|
777 |
+
iterable = iter(iterable)
|
778 |
+
executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
|
779 |
+
if use_processpool_executor:
|
780 |
+
executor_class = ProcessPoolExecutor
|
781 |
+
else:
|
782 |
+
executor_class = ThreadPoolExecutor
|
783 |
+
with executor_class(max_workers = max_workers) as executor:
|
784 |
+
futures: list[concurrent.futures.Future[Out]] = []
|
785 |
+
done = False
|
786 |
+
for _ in range(concurrency):
|
787 |
+
try:
|
788 |
+
futures.append(executor.submit(func, next(iterable)))
|
789 |
+
except StopIteration:
|
790 |
+
done = True
|
791 |
+
break
|
792 |
+
|
793 |
while futures:
|
794 |
result = futures.pop(0).result()
|
795 |
+
while not done and len(futures) < concurrency:
|
796 |
+
try:
|
797 |
+
futures.append(executor.submit(func, next(iterable)))
|
798 |
+
except StopIteration:
|
799 |
+
done = True
|
800 |
+
break
|
801 |
yield result
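A minimal usage sketch of bounded_parallel_map (hypothetical workload, not from convert.py): results come back in input order, and at most `concurrency` items are in flight even if the consumer is slow.

    import time

    def slow_square(x: int) -> int:
        time.sleep(0.01)      # stand-in for real per-tensor work
        return x * x

    for value in bounded_parallel_map(slow_square, range(8), concurrency=4):
        print(value)          # prints 0, 1, 4, 9, ... in input order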
|
802 |
|
|
|
803 |
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
804 |
if params.n_vocab != vocab.vocab_size:
|
805 |
+
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
|
|
806 |
if params.n_vocab == vocab.vocab_size_base:
|
807 |
print("Ignoring added_tokens.json since model matches vocab size without it.")
|
808 |
vocab.added_tokens_list = []
|
|
|
819 |
|
820 |
class OutputFile:
|
821 |
def __init__(self, fname_out: Path) -> None:
|
822 |
+
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
823 |
+
|
824 |
+
def add_meta_arch(self, params: Params) -> None:
|
825 |
+
name = "LLaMA"
|
826 |
+
|
827 |
+
# TODO: better logic to determine model name
|
828 |
+
if params.n_ctx == 4096:
|
829 |
+
name = "LLaMA v2"
|
830 |
+
elif params.path_model is not None:
|
831 |
+
name = str(params.path_model.parent).split('/')[-1]
|
832 |
+
|
833 |
+
self.gguf.add_name (name)
|
834 |
+
self.gguf.add_context_length (params.n_ctx)
|
835 |
+
self.gguf.add_embedding_length (params.n_embd)
|
836 |
+
self.gguf.add_block_count (params.n_layer)
|
837 |
+
self.gguf.add_feed_forward_length (params.n_ff)
|
838 |
+
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
839 |
+
self.gguf.add_head_count (params.n_head)
|
840 |
+
self.gguf.add_head_count_kv (params.n_head_kv)
|
841 |
+
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
842 |
+
|
843 |
+
if params.f_rope_freq_base is not None:
|
844 |
+
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
845 |
+
|
846 |
+
if params.f_rope_scale is not None:
|
847 |
+
self.gguf.add_rope_scale_linear(params.f_rope_scale)
|
848 |
+
|
849 |
+
if params.ftype is not None:
|
850 |
+
self.gguf.add_file_type(params.ftype)
|
851 |
+
|
852 |
+
def add_meta_vocab(self, vocab: Vocab) -> None:
|
853 |
+
tokens = []
|
854 |
+
scores = []
|
855 |
+
toktypes = []
|
856 |
+
# NOTE: `all_tokens` returns the base vocabulary and added tokens
|
857 |
+
for text, score, toktype in vocab.all_tokens():
|
858 |
+
tokens.append(text)
|
859 |
+
scores.append(score)
|
860 |
+
toktypes.append(toktype)
|
861 |
+
|
862 |
+
if isinstance(vocab, SentencePieceVocab):
|
863 |
+
self.gguf.add_tokenizer_model("llama")
|
864 |
+
elif isinstance(vocab, BpeVocab):
|
865 |
+
self.gguf.add_tokenizer_model("gpt2")
|
866 |
+
else:
|
867 |
+
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
|
868 |
+
self.gguf.add_token_list(tokens)
|
869 |
+
self.gguf.add_token_scores(scores)
|
870 |
+
self.gguf.add_token_types(toktypes)
|
871 |
+
|
872 |
+
def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
|
873 |
+
svocab.add_to_gguf(self.gguf)
|
874 |
+
|
875 |
+
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
876 |
+
n_elements = int(np.prod(tensor.shape))
|
877 |
+
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
878 |
+
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
|
879 |
+
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
880 |
+
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
|
881 |
+
|
882 |
+
def write_meta(self) -> None:
|
883 |
+
self.gguf.write_header_to_file()
|
884 |
+
self.gguf.write_kv_data_to_file()
|
885 |
+
|
886 |
+
def write_tensor_info(self) -> None:
|
887 |
+
self.gguf.write_ti_data_to_file()
|
888 |
+
|
889 |
+
def close(self) -> None:
|
890 |
+
self.gguf.close()
|
891 |
|
892 |
@staticmethod
|
893 |
+
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
|
894 |
+
check_vocab_size(params, vocab)
|
895 |
+
|
896 |
of = OutputFile(fname_out)
|
897 |
+
|
898 |
+
# meta data
|
899 |
+
of.add_meta_arch(params)
|
900 |
+
of.add_meta_vocab(vocab)
|
901 |
+
of.add_meta_special_vocab(svocab)
|
902 |
+
|
903 |
+
of.write_meta()
|
904 |
+
|
905 |
+
of.close()
|
906 |
+
|
907 |
+
@staticmethod
|
908 |
+
def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
|
909 |
+
name, lazy_tensor = item
|
910 |
+
tensor = lazy_tensor.load().to_ggml()
|
911 |
+
return (lazy_tensor.data_type, tensor.ndarray)
|
912 |
|
913 |
@staticmethod
|
914 |
+
def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
|
915 |
+
dt, arr = item
|
916 |
+
if not isinstance(dt, QuantizedDataType):
|
917 |
+
return arr
|
918 |
+
return dt.quantize(arr)
|
919 |
+
|
920 |
+
@staticmethod
|
921 |
+
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
922 |
check_vocab_size(params, vocab)
|
923 |
+
|
924 |
of = OutputFile(fname_out)
|
|
|
|
|
|
|
925 |
|
926 |
+
# meta data
|
927 |
+
of.add_meta_arch(params)
|
928 |
+
of.add_meta_vocab(vocab)
|
929 |
+
of.add_meta_special_vocab(svocab)
|
930 |
+
|
931 |
+
# tensor info
|
932 |
+
for name, lazy_tensor in model.items():
|
933 |
+
of.add_tensor_info(name, lazy_tensor)
|
934 |
+
|
935 |
+
of.write_meta()
|
936 |
+
of.write_tensor_info()
|
937 |
+
|
938 |
+
# tensor data
|
939 |
+
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
940 |
+
if ftype == GGMLFileType.MostlyQ8_0:
|
941 |
+
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
|
942 |
+
else:
|
943 |
+
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
944 |
|
945 |
+
start = time.time()
|
946 |
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
947 |
+
elapsed = time.time() - start
|
948 |
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
949 |
padi = len(str(len(model)))
|
950 |
+
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
|
951 |
+
of.gguf.write_tensor_data(ndarray)
|
|
|
|
|
952 |
|
953 |
+
of.close()
|
954 |
|
955 |
+
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
956 |
+
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
957 |
+
|
958 |
+
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
959 |
return GGMLFileType.AllF32
|
960 |
+
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
|
961 |
return GGMLFileType.MostlyF16
|
962 |
+
if output_type_str == "q8_0":
|
963 |
+
return GGMLFileType.MostlyQ8_0
|
964 |
+
|
|
|
|
|
|
|
|
|
|
|
965 |
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
966 |
+
|
967 |
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
968 |
|
969 |
+
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
970 |
+
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
|
971 |
+
for (name, tensor) in model.items()}
|
972 |
|
973 |
+
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
974 |
+
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
975 |
+
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
976 |
|
977 |
+
tmp = model
|
|
|
|
|
978 |
|
979 |
+
# HF models permute or pack some of the tensors, so we need to undo that
|
980 |
+
for i in itertools.count():
|
981 |
+
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
|
982 |
+
print(f"Permuting layer {i}")
|
983 |
+
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
|
984 |
+
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
|
985 |
+
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
986 |
+
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
987 |
+
print(f"Unpacking and permuting layer {i}")
|
988 |
+
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
989 |
+
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
|
990 |
+
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
|
991 |
+
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
992 |
+
else:
|
993 |
+
break
|
994 |
|
995 |
+
out: LazyModel = {}
|
996 |
+
for name, lazy_tensor in model.items():
|
997 |
+
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
|
998 |
+
if name_new is None:
|
999 |
+
raise Exception(f"Unexpected tensor name: {name}")
|
1000 |
|
1001 |
+
if tensor_type in should_skip:
|
1002 |
+
print(f"skipping tensor {name_new}")
|
1003 |
+
continue
|
1004 |
|
1005 |
+
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
1006 |
+
out[name_new] = lazy_tensor
|
1007 |
|
1008 |
+
return out
|
1009 |
+
|
1010 |
+
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
1011 |
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
1012 |
the nth path in the model.
|
1013 |
'''
|
1014 |
# Support the following patterns:
|
1015 |
+
patterns: list[tuple[str, str]] = [
|
1016 |
# - x.00.pth, x.01.pth, etc.
|
1017 |
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
1018 |
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
|
|
1028 |
return None
|
1029 |
|
1030 |
|
1031 |
+
def find_multifile_paths(path: Path) -> list[Path]:
|
1032 |
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
1033 |
the whole list of paths in the model.
|
1034 |
'''
|
1035 |
+
ret: list[Path] = []
|
1036 |
for i in itertools.count():
|
1037 |
nth_path = nth_multifile_path(path, i)
|
1038 |
if nth_path is None:
|
|
|
1056 |
# Try the PyTorch patterns too, with lower priority
|
1057 |
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
|
1058 |
files = [file for glob in globs for file in path.glob(glob)]
|
|
|
|
|
|
|
|
|
|
|
1059 |
if not files:
|
1060 |
raise Exception(f"Can't find model in directory {path}")
|
1061 |
if len(files) > 1:
|
|
|
1063 |
path = files[0]
|
1064 |
|
1065 |
paths = find_multifile_paths(path)
|
1066 |
+
models_plus: list[ModelPlus] = []
|
1067 |
for path in paths:
|
1068 |
print(f"Loading model file {path}")
|
1069 |
models_plus.append(lazy_load_file(path))
|
|
|
1072 |
return model_plus
|
1073 |
|
1074 |
|
1075 |
+
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
|
|
|
|
|
|
|
|
|
|
|
1076 |
# Be extra-friendly and accept either a file or a directory. Also, if it's
|
1077 |
# a directory, it might be the model directory, and tokenizer.model might
|
1078 |
# be in the parent of that.
|
1079 |
if path.is_dir():
|
1080 |
vocab_file = "tokenizer.model"
|
1081 |
if vocabtype == 'bpe':
|
1082 |
+
vocab_file = "vocab.json"
|
1083 |
path2 = path / vocab_file
|
1084 |
# Use `.parent` instead of /.. to handle the symlink case better.
|
1085 |
path3 = path.parent / vocab_file
|
|
|
1089 |
path = path3
|
1090 |
else:
|
1091 |
raise FileNotFoundError(
|
1092 |
+
f"Could not find {vocab_file} in {path} or its parent; "
|
1093 |
"if it's in another directory, pass the directory as --vocab-dir")
|
1094 |
+
|
1095 |
+
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
1096 |
+
|
1097 |
added_tokens_path = path.parent / "added_tokens.json"
|
1098 |
+
if vocabtype == "bpe":
|
1099 |
+
return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
1100 |
+
elif vocabtype == "spm":
|
1101 |
+
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
1102 |
+
else:
|
1103 |
+
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
1104 |
|
1105 |
|
1106 |
+
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
1107 |
namestr = {
|
1108 |
+
GGMLFileType.AllF32: "f32",
|
1109 |
GGMLFileType.MostlyF16: "f16",
|
1110 |
+
GGMLFileType.MostlyQ8_0:"q8_0",
|
|
|
|
|
1111 |
}[file_type]
|
1112 |
+
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
1113 |
if ret in model_paths:
|
1114 |
sys.stderr.write(
|
1115 |
f"Error: Default output path ({ret}) would overwrite the input. "
|
|
|
1126 |
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
|
1127 |
|
1128 |
|
1129 |
+
def main(args_in: list[str] | None = None) -> None:
|
1130 |
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
|
1131 |
+
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
1132 |
+
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
1133 |
+
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
1134 |
+
parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
1135 |
+
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
1136 |
+
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
1137 |
+
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
1138 |
+
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
1139 |
+
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
1140 |
+
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
1141 |
args = parser.parse_args(args_in)
|
1142 |
|
|
|
1143 |
if args.dump_single:
|
1144 |
model_plus = lazy_load_file(args.model)
|
1145 |
do_dump_model(model_plus)
|
1146 |
+
return
|
1147 |
+
|
1148 |
+
if not args.vocab_only:
|
1149 |
+
model_plus = load_some_model(args.model)
|
1150 |
+
else:
|
1151 |
+
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
1152 |
+
|
1153 |
+
if args.dump:
|
1154 |
+
do_dump_model(model_plus)
|
1155 |
+
return
|
1156 |
+
|
1157 |
+
params = Params.load(model_plus)
|
1158 |
+
if params.n_ctx == -1:
|
1159 |
+
if args.ctx is None:
|
1160 |
+
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
|
1161 |
+
"Please specify one with --ctx:\n"
|
1162 |
+
" - LLaMA v1: --ctx 2048\n"
|
1163 |
+
" - LLaMA v2: --ctx 4096\n")
|
1164 |
+
params.n_ctx = args.ctx
|
1165 |
+
|
1166 |
+
if args.outtype:
|
1167 |
+
params.ftype = {
|
1168 |
+
"f32": GGMLFileType.AllF32,
|
1169 |
+
"f16": GGMLFileType.MostlyF16,
|
1170 |
+
"q8_0": GGMLFileType.MostlyQ8_0,
|
1171 |
+
}[args.outtype]
|
1172 |
+
|
1173 |
+
print(f"params = {params}")
|
1174 |
+
|
1175 |
+
vocab: Vocab
|
1176 |
+
if args.vocab_only:
|
1177 |
assert args.outfile, "need --outfile if using --vocab-only"
|
1178 |
+
# FIXME: Try to respect vocab_dir somehow?
|
1179 |
+
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
1180 |
+
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
1181 |
outfile = args.outfile
|
1182 |
+
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
1183 |
print(f"Wrote {outfile}")
|
1184 |
+
return
|
1185 |
+
|
1186 |
+
if model_plus.vocab is not None and args.vocab_dir is None:
|
1187 |
+
vocab = model_plus.vocab
|
1188 |
else:
|
1189 |
+
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
1190 |
+
vocab = load_vocab(vocab_dir, args.vocabtype)
|
1191 |
+
# FIXME: Try to respect vocab_dir somehow?
|
1192 |
+
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
1193 |
+
|
1194 |
+
model = model_plus.model
|
1195 |
+
model = convert_model_names(model, params)
|
1196 |
+
ftype = pick_output_type(model, args.outtype)
|
1197 |
+
model = convert_to_output_type(model, ftype)
|
1198 |
+
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
1199 |
+
|
1200 |
+
params.ftype = ftype
|
1201 |
+
print(f"Writing {outfile}, format {ftype}")
|
1202 |
+
|
1203 |
+
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
|
1204 |
+
print(f"Wrote {outfile}")
|
|
|
1205 |
|
1206 |
|
1207 |
if __name__ == '__main__':
|
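For reference, `main()` above accepts an explicit argument list (`args_in`), so the converter can be driven from Python as well as from the shell. A minimal sketch, assuming the script is importable as `convert` and using placeholder paths:

```python
# Hedged sketch: "models/7B" and the output file are placeholders, not paths from this commit.
import convert  # assumes convert.py from this repo is on the import path

convert.main([
    "models/7B",                                   # positional "model" argument (directory or file)
    "--outtype", "f16",                            # one of f32 / f16 / q8_0, as defined above
    "--outfile", "models/7B/ggml-model-f16.gguf",
    "--ctx", "2048",                               # only needed when the checkpoint carries no context size
])
```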
docs/token_generation_performance_tips.md
CHANGED
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB
 
-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
 
-Run command: `./main -m "path/to/model.
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
 
 Result:
 
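To apply the tip above from a script, one possible approach is to launch `./main` with an oversized `-ngl` and inspect its diagnostics. A rough sketch, assuming a placeholder model path and that the relevant lines mention offloading or BLAS (both assumptions, not part of this commit):

```python
# Hedged sketch: run ./main as described in the doc above and surface its GPU diagnostics.
import subprocess

proc = subprocess.run(
    ["./main", "-m", "path/to/model.gguf", "-ngl", "200000",
     "-p", "Please sir, may I have some "],
    capture_output=True, text=True,
)

# llama.cpp writes its loading/system diagnostics to stderr; print the candidate lines.
for line in proc.stderr.splitlines():
    if "offload" in line.lower() or "BLAS" in line:
        print(line)
```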
examples/CMakeLists.txt
CHANGED
@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
 
 # ...
 
-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    console.h
-    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -42,8 +21,12 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
+    add_subdirectory(speculative)
     add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
+    add_subdirectory(beam-search)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
examples/baby-llama/baby-llama.cpp
CHANGED
@@ -9,12 +9,12 @@
|
|
9 |
#endif
|
10 |
|
11 |
#ifdef LLAMA_DEFAULT_RMS_EPS
|
12 |
-
|
13 |
#else
|
14 |
-
|
15 |
#endif
|
16 |
|
17 |
-
float frand() {
|
18 |
return (float)rand()/(float)RAND_MAX;
|
19 |
}
|
20 |
|
@@ -25,19 +25,21 @@ struct random_normal_distribution {
|
|
25 |
float max;
|
26 |
};
|
27 |
|
28 |
-
void init_random_normal_distribution(
|
|
|
|
|
29 |
rnd->gen = std::mt19937(seed);
|
30 |
rnd->nd = std::normal_distribution<float>{mean, std};
|
31 |
rnd->min = min;
|
32 |
rnd->max = max;
|
33 |
}
|
34 |
|
35 |
-
float frand_normal(struct random_normal_distribution * rnd) {
|
36 |
const float r = rnd->nd(rnd->gen);
|
37 |
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
38 |
}
|
39 |
|
40 |
-
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
41 |
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
42 |
|
43 |
if (plan.work_size > 0) {
|
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
|
48 |
ggml_graph_compute(graph, &plan);
|
49 |
}
|
50 |
|
51 |
-
struct ggml_tensor * randomize_tensor(
|
52 |
-
|
53 |
-
|
54 |
-
const int64_t ne[],
|
55 |
-
float fmin,
|
56 |
-
float fmax) {
|
57 |
-
|
58 |
switch (ndims) {
|
59 |
case 1:
|
60 |
for (int i0 = 0; i0 < ne[0]; i0++) {
|
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
|
|
95 |
return tensor;
|
96 |
}
|
97 |
|
98 |
-
struct ggml_tensor * randomize_tensor_normal(
|
99 |
-
|
100 |
-
|
101 |
-
const int64_t ne[],
|
102 |
-
struct random_normal_distribution * rnd) {
|
103 |
float scale = 1.0; // xavier
|
104 |
switch (ndims) {
|
105 |
case 1:
|
@@ -159,7 +155,7 @@ struct llama_hparams {
|
|
159 |
}
|
160 |
};
|
161 |
|
162 |
-
uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
163 |
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
164 |
return n_ff;
|
165 |
}
|
@@ -260,7 +256,7 @@ struct llama_model_lora {
|
|
260 |
std::vector<llama_layer_lora> layers;
|
261 |
};
|
262 |
|
263 |
-
void init_model(struct llama_model * model) {
|
264 |
const auto & hparams = model->hparams;
|
265 |
|
266 |
const uint32_t n_embd = hparams.n_embd;
|
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
|
|
297 |
}
|
298 |
|
299 |
|
300 |
-
void init_model_lora(struct llama_model_lora * model) {
|
301 |
const auto & hparams = model->hparams;
|
302 |
|
303 |
const uint32_t n_embd = hparams.n_embd;
|
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
|
|
340 |
}
|
341 |
}
|
342 |
|
343 |
-
void set_param_model(struct llama_model * model) {
|
344 |
const auto& hparams = model->hparams;
|
345 |
|
346 |
const uint32_t n_layer = hparams.n_layer;
|
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
|
|
366 |
}
|
367 |
}
|
368 |
|
369 |
-
void set_param_model_lora(struct llama_model_lora * model) {
|
370 |
const auto& hparams = model->hparams;
|
371 |
|
372 |
const uint32_t n_layer = hparams.n_layer;
|
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
|
|
397 |
}
|
398 |
}
|
399 |
|
400 |
-
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
401 |
const auto & hparams = model->hparams;
|
402 |
|
403 |
const uint32_t n_layer = hparams.n_layer;
|
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
|
|
426 |
}
|
427 |
|
428 |
|
429 |
-
void randomize_model_lora(
|
|
|
|
|
430 |
const auto & hparams = model->hparams;
|
431 |
|
432 |
const uint32_t n_layer = hparams.n_layer;
|
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
|
|
459 |
}
|
460 |
}
|
461 |
|
462 |
-
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
463 |
const auto & hparams = model->hparams;
|
464 |
|
465 |
const uint32_t n_ctx = hparams.n_ctx;
|
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
|
|
495 |
return true;
|
496 |
}
|
497 |
|
498 |
-
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
499 |
const auto & hparams = model->hparams;
|
500 |
|
501 |
const uint32_t n_ctx = hparams.n_ctx;
|
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
|
|
531 |
return true;
|
532 |
}
|
533 |
|
534 |
-
struct ggml_tensor * forward(
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
const int N = n_tokens;
|
544 |
|
545 |
struct llama_kv_cache& kv_self = *cache;
|
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
|
|
756 |
return inpL;
|
757 |
}
|
758 |
|
759 |
-
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
760 |
GGML_ASSERT(tensor->n_dims == 1);
|
761 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
762 |
}
|
763 |
|
764 |
-
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
765 |
GGML_ASSERT(tensor->n_dims == 2);
|
766 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
767 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
768 |
}
|
769 |
|
770 |
-
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
771 |
GGML_ASSERT(tensor->n_dims == 3);
|
772 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
773 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
774 |
GGML_ASSERT(tensor->ne[2] == ne2);
|
775 |
}
|
776 |
|
777 |
-
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
778 |
GGML_ASSERT(tensor->n_dims == 4);
|
779 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
780 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|
782 |
GGML_ASSERT(tensor->ne[3] == ne3);
|
783 |
}
|
784 |
|
785 |
-
struct ggml_tensor * forward_batch(
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
const int N = n_tokens;
|
796 |
|
797 |
struct llama_kv_cache& kv_self = *cache;
|
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
|
|
1073 |
return inpL;
|
1074 |
}
|
1075 |
|
1076 |
-
|
1077 |
-
struct
|
1078 |
-
|
1079 |
-
|
1080 |
-
|
1081 |
-
|
1082 |
-
|
1083 |
-
|
1084 |
-
|
1085 |
-
|
1086 |
const int N = n_tokens;
|
1087 |
|
1088 |
struct llama_kv_cache& kv_self = *cache;
|
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
|
|
1328 |
return inpL;
|
1329 |
}
|
1330 |
|
1331 |
-
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
1332 |
assert(logits->n_dims == 2);
|
1333 |
assert(probs->n_dims == 2);
|
1334 |
assert(best_samples->n_dims == 1);
|
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
|
|
1359 |
}
|
1360 |
}
|
1361 |
|
1362 |
-
void sample_softmax_batch(
|
|
|
|
|
|
|
1363 |
GGML_ASSERT(best_samples->n_dims == 2);
|
1364 |
GGML_ASSERT(logits->n_dims == 3);
|
1365 |
GGML_ASSERT(probs->n_dims == 3);
|
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
|
|
1393 |
}
|
1394 |
}
|
1395 |
|
1396 |
-
void print_row(struct ggml_tensor * probs, int i) {
|
1397 |
for (int k = 0; k < probs->ne[0]; ++k) {
|
1398 |
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
1399 |
printf(" %.2f", p);
|
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
|
1401 |
printf("\n");
|
1402 |
}
|
1403 |
|
1404 |
-
void print_matrix(struct ggml_tensor * probs) {
|
1405 |
assert(probs->n_dims == 2);
|
1406 |
for (int i = 0; i < probs->ne[1]; ++i) {
|
1407 |
for (int k = 0; k < probs->ne[0]; ++k) {
|
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
|
1412 |
}
|
1413 |
}
|
1414 |
|
1415 |
-
void print_token(int token, int n_vocab) {
|
1416 |
for (int k = 0; k < token; ++k) {
|
1417 |
printf(" ");
|
1418 |
}
|
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
|
|
1423 |
printf("\n");
|
1424 |
}
|
1425 |
|
1426 |
-
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
1427 |
for (int i=0; i<tokens->ne[0]; ++i) {
|
1428 |
int token = ggml_get_i32_1d(tokens, i);
|
1429 |
print_token(token, n_vocab);
|
1430 |
}
|
1431 |
}
|
1432 |
|
1433 |
-
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
1434 |
int n_tokens = tokens_input->ne[0];
|
1435 |
int n_vocab = targets->ne[0];
|
1436 |
float randomness = 0.0f;
|
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
|
|
1451 |
}
|
1452 |
}
|
1453 |
|
1454 |
-
void get_example_targets_batch(
|
|
|
|
|
1455 |
GGML_ASSERT(tokens_input->n_dims == 2);
|
1456 |
GGML_ASSERT( targets->n_dims == 3);
|
1457 |
int n_tokens = tokens_input->ne[0];
|
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
|
|
1474 |
}
|
1475 |
}
|
1476 |
|
1477 |
-
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
1478 |
int n_tokens = tokens_input->ne[0];
|
1479 |
int n_vocab = targets->ne[0];
|
1480 |
for (int i=0; i<n_tokens-n_shift; ++i) {
|
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
|
|
1485 |
}
|
1486 |
}
|
1487 |
|
1488 |
-
struct ggml_tensor * square_error_loss(
|
|
|
|
|
1489 |
// todo: instead of a-b: a[1:]-b[:-1]
|
1490 |
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
1491 |
}
|
1492 |
|
1493 |
-
struct ggml_tensor * cross_entropy_loss(
|
|
|
|
|
1494 |
const float eps = 1e-3f;
|
1495 |
return
|
1496 |
ggml_sum(ctx,
|
@@ -1617,15 +1623,10 @@ int main(int argc, char ** argv) {
|
|
1617 |
|
1618 |
float error_before_opt = ggml_get_f32_1d(e, 0);
|
1619 |
|
1620 |
-
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
|
1621 |
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
1622 |
-
opt_params_adam.print_forward_graph = false;
|
1623 |
-
opt_params_adam.print_backward_graph = false;
|
1624 |
opt_params_lbfgs.print_forward_graph = false;
|
1625 |
opt_params_lbfgs.print_backward_graph = false;
|
1626 |
-
opt_params_adam.adam.n_iter = 16;
|
1627 |
opt_params_lbfgs.lbfgs.n_iter = 16;
|
1628 |
-
// ggml_opt(ctx0, opt_params_adam, e);
|
1629 |
ggml_opt(ctx0, opt_params_lbfgs, e);
|
1630 |
//
|
1631 |
ggml_build_forward_expand(&gf, e);
|
|
|
9 |
#endif
|
10 |
|
11 |
#ifdef LLAMA_DEFAULT_RMS_EPS
|
12 |
+
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
13 |
#else
|
14 |
+
constexpr float rms_norm_eps = 5e-6f;
|
15 |
#endif
|
16 |
|
17 |
+
static float frand() {
|
18 |
return (float)rand()/(float)RAND_MAX;
|
19 |
}
|
20 |
|
|
|
25 |
float max;
|
26 |
};
|
27 |
|
28 |
+
static void init_random_normal_distribution(
|
29 |
+
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
30 |
+
) {
|
31 |
rnd->gen = std::mt19937(seed);
|
32 |
rnd->nd = std::normal_distribution<float>{mean, std};
|
33 |
rnd->min = min;
|
34 |
rnd->max = max;
|
35 |
}
|
36 |
|
37 |
+
static float frand_normal(struct random_normal_distribution * rnd) {
|
38 |
const float r = rnd->nd(rnd->gen);
|
39 |
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
40 |
}
|
41 |
|
42 |
+
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
43 |
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
44 |
|
45 |
if (plan.work_size > 0) {
|
|
|
50 |
ggml_graph_compute(graph, &plan);
|
51 |
}
|
52 |
|
53 |
+
static struct ggml_tensor * randomize_tensor(
|
54 |
+
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
|
55 |
+
) {
|
|
|
|
|
|
|
|
|
56 |
switch (ndims) {
|
57 |
case 1:
|
58 |
for (int i0 = 0; i0 < ne[0]; i0++) {
|
|
|
93 |
return tensor;
|
94 |
}
|
95 |
|
96 |
+
static struct ggml_tensor * randomize_tensor_normal(
|
97 |
+
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
98 |
+
) {
|
|
|
|
|
99 |
float scale = 1.0; // xavier
|
100 |
switch (ndims) {
|
101 |
case 1:
|
|
|
155 |
}
|
156 |
};
|
157 |
|
158 |
+
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
159 |
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
160 |
return n_ff;
|
161 |
}
|
|
|
256 |
std::vector<llama_layer_lora> layers;
|
257 |
};
|
258 |
|
259 |
+
static void init_model(struct llama_model * model) {
|
260 |
const auto & hparams = model->hparams;
|
261 |
|
262 |
const uint32_t n_embd = hparams.n_embd;
|
|
|
293 |
}
|
294 |
|
295 |
|
296 |
+
static void init_model_lora(struct llama_model_lora * model) {
|
297 |
const auto & hparams = model->hparams;
|
298 |
|
299 |
const uint32_t n_embd = hparams.n_embd;
|
|
|
336 |
}
|
337 |
}
|
338 |
|
339 |
+
static void set_param_model(struct llama_model * model) {
|
340 |
const auto& hparams = model->hparams;
|
341 |
|
342 |
const uint32_t n_layer = hparams.n_layer;
|
|
|
362 |
}
|
363 |
}
|
364 |
|
365 |
+
static void set_param_model_lora(struct llama_model_lora * model) {
|
366 |
const auto& hparams = model->hparams;
|
367 |
|
368 |
const uint32_t n_layer = hparams.n_layer;
|
|
|
393 |
}
|
394 |
}
|
395 |
|
396 |
+
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
397 |
const auto & hparams = model->hparams;
|
398 |
|
399 |
const uint32_t n_layer = hparams.n_layer;
|
|
|
422 |
}
|
423 |
|
424 |
|
425 |
+
static void randomize_model_lora(
|
426 |
+
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
|
427 |
+
) {
|
428 |
const auto & hparams = model->hparams;
|
429 |
|
430 |
const uint32_t n_layer = hparams.n_layer;
|
|
|
457 |
}
|
458 |
}
|
459 |
|
460 |
+
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
461 |
const auto & hparams = model->hparams;
|
462 |
|
463 |
const uint32_t n_ctx = hparams.n_ctx;
|
|
|
493 |
return true;
|
494 |
}
|
495 |
|
496 |
+
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
497 |
const auto & hparams = model->hparams;
|
498 |
|
499 |
const uint32_t n_ctx = hparams.n_ctx;
|
|
|
529 |
return true;
|
530 |
}
|
531 |
|
532 |
+
static struct ggml_tensor * forward(
|
533 |
+
struct llama_model * model,
|
534 |
+
struct llama_kv_cache * cache,
|
535 |
+
struct ggml_context * ctx0,
|
536 |
+
struct ggml_cgraph * gf,
|
537 |
+
struct ggml_tensor * tokens_input,
|
538 |
+
const int n_tokens,
|
539 |
+
const int n_past
|
540 |
+
) {
|
541 |
const int N = n_tokens;
|
542 |
|
543 |
struct llama_kv_cache& kv_self = *cache;
|
|
|
754 |
return inpL;
|
755 |
}
|
756 |
|
757 |
+
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
758 |
GGML_ASSERT(tensor->n_dims == 1);
|
759 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
760 |
}
|
761 |
|
762 |
+
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
763 |
GGML_ASSERT(tensor->n_dims == 2);
|
764 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
765 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
766 |
}
|
767 |
|
768 |
+
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
769 |
GGML_ASSERT(tensor->n_dims == 3);
|
770 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
771 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
772 |
GGML_ASSERT(tensor->ne[2] == ne2);
|
773 |
}
|
774 |
|
775 |
+
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
776 |
GGML_ASSERT(tensor->n_dims == 4);
|
777 |
GGML_ASSERT(tensor->ne[0] == ne0);
|
778 |
GGML_ASSERT(tensor->ne[1] == ne1);
|
|
|
780 |
GGML_ASSERT(tensor->ne[3] == ne3);
|
781 |
}
|
782 |
|
783 |
+
static struct ggml_tensor * forward_batch(
|
784 |
+
struct llama_model * model,
|
785 |
+
struct llama_kv_cache * cache,
|
786 |
+
struct ggml_context * ctx0,
|
787 |
+
struct ggml_cgraph * gf,
|
788 |
+
struct ggml_tensor * tokens_input,
|
789 |
+
const int n_tokens,
|
790 |
+
const int n_past,
|
791 |
+
const int n_batch
|
792 |
+
) {
|
793 |
const int N = n_tokens;
|
794 |
|
795 |
struct llama_kv_cache& kv_self = *cache;
|
|
|
1071 |
return inpL;
|
1072 |
}
|
1073 |
|
1074 |
+
static struct ggml_tensor * forward_lora(
|
1075 |
+
struct llama_model_lora * model,
|
1076 |
+
struct llama_kv_cache * cache,
|
1077 |
+
struct ggml_context * ctx0,
|
1078 |
+
struct ggml_cgraph * gf,
|
1079 |
+
struct ggml_tensor * tokens_input,
|
1080 |
+
const int n_tokens,
|
1081 |
+
const int n_past
|
1082 |
+
) {
|
|
|
1083 |
const int N = n_tokens;
|
1084 |
|
1085 |
struct llama_kv_cache& kv_self = *cache;
|
|
|
1325 |
return inpL;
|
1326 |
}
|
1327 |
|
1328 |
+
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
1329 |
assert(logits->n_dims == 2);
|
1330 |
assert(probs->n_dims == 2);
|
1331 |
assert(best_samples->n_dims == 1);
|
|
|
1356 |
}
|
1357 |
}
|
1358 |
|
1359 |
+
static void sample_softmax_batch(
|
1360 |
+
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
1361 |
+
struct ggml_tensor * best_samples
|
1362 |
+
) {
|
1363 |
GGML_ASSERT(best_samples->n_dims == 2);
|
1364 |
GGML_ASSERT(logits->n_dims == 3);
|
1365 |
GGML_ASSERT(probs->n_dims == 3);
|
|
|
1393 |
}
|
1394 |
}
|
1395 |
|
1396 |
+
static void print_row(struct ggml_tensor * probs, int i) {
|
1397 |
for (int k = 0; k < probs->ne[0]; ++k) {
|
1398 |
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
1399 |
printf(" %.2f", p);
|
|
|
1401 |
printf("\n");
|
1402 |
}
|
1403 |
|
1404 |
+
static void print_matrix(struct ggml_tensor * probs) {
|
1405 |
assert(probs->n_dims == 2);
|
1406 |
for (int i = 0; i < probs->ne[1]; ++i) {
|
1407 |
for (int k = 0; k < probs->ne[0]; ++k) {
|
|
|
1412 |
}
|
1413 |
}
|
1414 |
|
1415 |
+
static void print_token(int token, int n_vocab) {
|
1416 |
for (int k = 0; k < token; ++k) {
|
1417 |
printf(" ");
|
1418 |
}
|
|
|
1423 |
printf("\n");
|
1424 |
}
|
1425 |
|
1426 |
+
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
1427 |
for (int i=0; i<tokens->ne[0]; ++i) {
|
1428 |
int token = ggml_get_i32_1d(tokens, i);
|
1429 |
print_token(token, n_vocab);
|
1430 |
}
|
1431 |
}
|
1432 |
|
1433 |
+
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
1434 |
int n_tokens = tokens_input->ne[0];
|
1435 |
int n_vocab = targets->ne[0];
|
1436 |
float randomness = 0.0f;
|
|
|
1451 |
}
|
1452 |
}
|
1453 |
|
1454 |
+
static void get_example_targets_batch(
|
1455 |
+
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
1456 |
+
) {
|
1457 |
GGML_ASSERT(tokens_input->n_dims == 2);
|
1458 |
GGML_ASSERT( targets->n_dims == 3);
|
1459 |
int n_tokens = tokens_input->ne[0];
|
|
|
1476 |
}
|
1477 |
}
|
1478 |
|
1479 |
+
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
1480 |
int n_tokens = tokens_input->ne[0];
|
1481 |
int n_vocab = targets->ne[0];
|
1482 |
for (int i=0; i<n_tokens-n_shift; ++i) {
|
|
|
1487 |
}
|
1488 |
}
|
1489 |
|
1490 |
+
static struct ggml_tensor * square_error_loss(
|
1491 |
+
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
1492 |
+
) {
|
1493 |
// todo: instead of a-b: a[1:]-b[:-1]
|
1494 |
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
1495 |
}
|
1496 |
|
1497 |
+
static struct ggml_tensor * cross_entropy_loss(
|
1498 |
+
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
1499 |
+
) {
|
1500 |
const float eps = 1e-3f;
|
1501 |
return
|
1502 |
ggml_sum(ctx,
|
|
|
1623 |
|
1624 |
float error_before_opt = ggml_get_f32_1d(e, 0);
|
1625 |
|
|
|
1626 |
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
|
|
|
|
1627 |
opt_params_lbfgs.print_forward_graph = false;
|
1628 |
opt_params_lbfgs.print_backward_graph = false;
|
|
|
1629 |
opt_params_lbfgs.lbfgs.n_iter = 16;
|
|
|
1630 |
ggml_opt(ctx0, opt_params_lbfgs, e);
|
1631 |
//
|
1632 |
ggml_build_forward_expand(&gf, e);
|
examples/beam-search/CMakeLists.txt
ADDED
@@ -0,0 +1,5 @@
+set(TARGET beam-search)
+add_executable(${TARGET} beam-search.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/beam-search/beam-search.cpp
ADDED
@@ -0,0 +1,186 @@
|
1 |
+
#include "common.h"
|
2 |
+
#include "llama.h"
|
3 |
+
|
4 |
+
#include <cassert>
|
5 |
+
#include <cinttypes>
|
6 |
+
#include <cmath>
|
7 |
+
#include <cstdio>
|
8 |
+
#include <cstring>
|
9 |
+
#include <ctime>
|
10 |
+
#include <fstream>
|
11 |
+
#include <iostream>
|
12 |
+
#include <string>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
16 |
+
#include <signal.h>
|
17 |
+
#include <unistd.h>
|
18 |
+
#elif defined (_WIN32)
|
19 |
+
#define WIN32_LEAN_AND_MEAN
|
20 |
+
#ifndef NOMINMAX
|
21 |
+
# define NOMINMAX
|
22 |
+
#endif
|
23 |
+
#include <windows.h>
|
24 |
+
#include <signal.h>
|
25 |
+
#endif
|
26 |
+
|
27 |
+
// Used for debugging to print out beam tokens.
|
28 |
+
struct ostream_beam_view {
|
29 |
+
llama_context * ctx;
|
30 |
+
llama_beam_view beam_view;
|
31 |
+
};
|
32 |
+
|
33 |
+
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
34 |
+
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
35 |
+
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
36 |
+
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
37 |
+
}
|
38 |
+
return os << ')';
|
39 |
+
}
|
40 |
+
|
41 |
+
// Put here anything you want back in beam_search_callback().
|
42 |
+
struct beam_search_callback_data {
|
43 |
+
llama_context * ctx;
|
44 |
+
std::vector<llama_token> response;
|
45 |
+
};
|
46 |
+
|
47 |
+
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
48 |
+
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
49 |
+
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
50 |
+
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
51 |
+
}
|
52 |
+
|
53 |
+
// Function matching type llama_beam_search_callback_fn_t.
|
54 |
+
// Custom callback example is called each time the beams lengths increase:
|
55 |
+
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
56 |
+
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
57 |
+
// This is also called when the stop condition is met.
|
58 |
+
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
59 |
+
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
60 |
+
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
61 |
+
// Mark beams as EOS as needed.
|
62 |
+
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
63 |
+
llama_beam_view& beam_view = beams_state.beam_views[i];
|
64 |
+
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
65 |
+
beam_view.eob = true;
|
66 |
+
}
|
67 |
+
}
|
68 |
+
printf(","); // Show progress
|
69 |
+
if (const size_t n = beams_state.common_prefix_length) {
|
70 |
+
callback_data.response.resize(callback_data.response.size() + n);
|
71 |
+
assert(0u < beams_state.n_beams);
|
72 |
+
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
73 |
+
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
74 |
+
printf("%zu", n);
|
75 |
+
}
|
76 |
+
fflush(stdout);
|
77 |
+
#if 1 // DEBUG: print current beams for this iteration
|
78 |
+
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
79 |
+
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
80 |
+
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
81 |
+
}
|
82 |
+
#endif
|
83 |
+
}
|
84 |
+
|
85 |
+
int main(int argc, char ** argv)
|
86 |
+
{
|
87 |
+
gpt_params params;
|
88 |
+
//params.n_gpu_layers = 200;
|
89 |
+
|
90 |
+
//---------------------------------
|
91 |
+
// Print help :
|
92 |
+
//---------------------------------
|
93 |
+
|
94 |
+
if ( argc < 2 || argv[1][0] == '-' )
|
95 |
+
{
|
96 |
+
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
97 |
+
return 1 ;
|
98 |
+
}
|
99 |
+
|
100 |
+
//---------------------------------
|
101 |
+
// Load parameters :
|
102 |
+
//---------------------------------
|
103 |
+
|
104 |
+
params.model = argv[1];
|
105 |
+
|
106 |
+
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
107 |
+
|
108 |
+
if ( argc > 3 )
|
109 |
+
{
|
110 |
+
params.prompt = argv[3];
|
111 |
+
}
|
112 |
+
|
113 |
+
if ( params.prompt.empty() )
|
114 |
+
{
|
115 |
+
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
116 |
+
}
|
117 |
+
|
118 |
+
//---------------------------------
|
119 |
+
// Init LLM :
|
120 |
+
//---------------------------------
|
121 |
+
|
122 |
+
llama_backend_init(params.numa);
|
123 |
+
|
124 |
+
llama_model * model;
|
125 |
+
llama_context * ctx;
|
126 |
+
|
127 |
+
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
128 |
+
|
129 |
+
if ( model == NULL )
|
130 |
+
{
|
131 |
+
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
132 |
+
return 1;
|
133 |
+
}
|
134 |
+
|
135 |
+
//---------------------------------
|
136 |
+
// Tokenize the prompt :
|
137 |
+
//---------------------------------
|
138 |
+
|
139 |
+
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
140 |
+
|
141 |
+
const size_t max_context_size = llama_n_ctx( ctx );
|
142 |
+
const size_t max_tokens_list_size = max_context_size - 4 ;
|
143 |
+
|
144 |
+
if (tokens_list.size() > max_tokens_list_size)
|
145 |
+
{
|
146 |
+
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
147 |
+
__func__ , tokens_list.size() , max_tokens_list_size );
|
148 |
+
return 1;
|
149 |
+
}
|
150 |
+
|
151 |
+
fprintf( stderr, "\n\n" );
|
152 |
+
|
153 |
+
// Print the tokens from the prompt :
|
154 |
+
|
155 |
+
for( auto id : tokens_list )
|
156 |
+
{
|
157 |
+
std::cout << llama_token_to_piece(ctx, id);
|
158 |
+
}
|
159 |
+
std::cout << std::flush;
|
160 |
+
|
161 |
+
int n_past = llama_get_kv_cache_token_count(ctx);
|
162 |
+
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
|
163 |
+
{
|
164 |
+
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
165 |
+
return 1;
|
166 |
+
}
|
167 |
+
n_past += tokens_list.size();
|
168 |
+
|
169 |
+
beam_search_callback_data callback_data{ctx, {}};
|
170 |
+
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
171 |
+
int const n_predict = 256;
|
172 |
+
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
|
173 |
+
|
174 |
+
std::cout << "\n\n";
|
175 |
+
for (llama_token const token_id : callback_data.response) {
|
176 |
+
std::cout << llama_token_to_piece(ctx,token_id);
|
177 |
+
}
|
178 |
+
std::cout << std::endl;
|
179 |
+
|
180 |
+
llama_free( ctx );
|
181 |
+
llama_free_model( model );
|
182 |
+
|
183 |
+
llama_backend_free();
|
184 |
+
|
185 |
+
return 0;
|
186 |
+
}
|
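Based on the usage string printed by `main()` above (`Usage: MODEL_PATH [BEAM_WIDTH=2] [PROMPT]`), a hypothetical invocation from Python could look like the following; the model path is a placeholder and the prompt is the example's built-in default:

```python
# Hedged sketch: run the beam-search example with width 2 and the default prompt from the code above.
import subprocess

subprocess.run(
    ["./beam-search",
     "models/7B/ggml-model-q4_0.gguf",   # placeholder MODEL_PATH
     "2",                                # BEAM_WIDTH
     "### Request:\nHow many countries are there?\n\n### Response:\n"],
    check=True,
)
```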
examples/benchmark/CMakeLists.txt
CHANGED
@@ -1,7 +1,8 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
#include "ggml.h"
|
2 |
#include "build-info.h"
|
|
|
|
|
3 |
|
4 |
#include <locale.h>
|
5 |
#include <assert.h>
|
@@ -20,7 +21,7 @@
|
|
20 |
#pragma warning(disable: 4244 4267) // possible loss of data
|
21 |
#endif
|
22 |
|
23 |
-
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
24 |
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
25 |
|
26 |
if (plan.work_size > 0) {
|
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
|
31 |
ggml_graph_compute(graph, &plan);
|
32 |
}
|
33 |
|
34 |
-
float tensor_sum_elements(const ggml_tensor * tensor) {
|
35 |
-
|
36 |
-
if (tensor->type==GGML_TYPE_F32) {
|
37 |
for (int j = 0; j < tensor->ne[1]; j++) {
|
38 |
for (int k = 0; k < tensor->ne[0]; k++) {
|
39 |
-
sum +=
|
40 |
}
|
41 |
}
|
42 |
}
|
43 |
return sum;
|
44 |
}
|
45 |
|
46 |
-
void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
47 |
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
48 |
tensor->type, ggml_type_name(tensor->type),
|
49 |
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
|
|
58 |
int32_t n_iterations = 10;
|
59 |
};
|
60 |
|
61 |
-
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
62 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
63 |
fprintf(stderr, "\n");
|
64 |
fprintf(stderr, "options:\n");
|
@@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
|
|
99 |
exit(1);
|
100 |
}
|
101 |
|
102 |
-
|
103 |
printf("Starting Test\n");
|
104 |
|
105 |
// create the ggml context
|
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
|
|
125 |
|
126 |
//printf("Memsize required = %i\n", sizex*sizex);
|
127 |
|
|
|
|
|
|
|
128 |
size_t ctx_size = 0;
|
129 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
130 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
131 |
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
132 |
-
ctx_size += sizex*sizey*ggml_type_sizef(
|
133 |
-
ctx_size += sizex*sizey*ggml_type_sizef(
|
134 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
135 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
136 |
ctx_size += 1024*1024*16;
|
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
|
|
163 |
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
164 |
ggml_set_f32(m2, 2.0f);
|
165 |
|
166 |
-
printf("\n------ Test 1 - Matrix Mult via F32 code
|
167 |
// printf("Creating new tensor m11xm2\n");
|
168 |
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
169 |
|
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
|
|
181 |
|
182 |
TENSOR_DUMP(gf.nodes[0]);
|
183 |
|
184 |
-
printf("\n------ Test 2 - Matrix Mult via
|
185 |
|
186 |
int32_t nelements = sizex*sizey;
|
187 |
-
int32_t ne[2] = { sizex, sizey };
|
188 |
|
189 |
std::vector<int64_t> hist_cur(1 << 4, 0);
|
190 |
|
191 |
// Set up a the benchmark matrices
|
192 |
// printf("Creating new tensor q11 & Running quantize\n");
|
193 |
-
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx,
|
194 |
-
|
195 |
|
196 |
// Set up a the compute graph
|
197 |
// printf("Creating new tensor q31\n");
|
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
|
|
202 |
|
203 |
// Set up a second graph computation to make sure we override the CPU cache lines
|
204 |
// printf("Creating new tensor q12 & Running quantize\n");
|
205 |
-
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx,
|
206 |
-
|
207 |
|
208 |
// printf("Creating new tensor q32\n");
|
209 |
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
|
|
220 |
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
221 |
|
222 |
|
223 |
-
// Let's use the F32 result from above as a reference for the
|
224 |
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
225 |
|
226 |
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
|
|
250 |
// Check that the matrix multiplication result is in the right ballpark
|
251 |
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
252 |
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
253 |
-
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
254 |
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
255 |
|
256 |
if (delta > allowed_delta) {
|
|
|
|
|
1 |
#include "build-info.h"
|
2 |
+
#include "common.h"
|
3 |
+
#include "ggml.h"
|
4 |
|
5 |
#include <locale.h>
|
6 |
#include <assert.h>
|
|
|
21 |
#pragma warning(disable: 4244 4267) // possible loss of data
|
22 |
#endif
|
23 |
|
24 |
+
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
25 |
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
26 |
|
27 |
if (plan.work_size > 0) {
|
|
|
32 |
ggml_graph_compute(graph, &plan);
|
33 |
}
|
34 |
|
35 |
+
static float tensor_sum_elements(const ggml_tensor * tensor) {
|
36 |
+
double sum = 0;
|
37 |
+
if (tensor->type == GGML_TYPE_F32) {
|
38 |
for (int j = 0; j < tensor->ne[1]; j++) {
|
39 |
for (int k = 0; k < tensor->ne[0]; k++) {
|
40 |
+
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
|
41 |
}
|
42 |
}
|
43 |
}
|
44 |
return sum;
|
45 |
}
|
46 |
|
47 |
+
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
48 |
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
49 |
tensor->type, ggml_type_name(tensor->type),
|
50 |
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
|
|
59 |
int32_t n_iterations = 10;
|
60 |
};
|
61 |
|
62 |
+
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
63 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
64 |
fprintf(stderr, "\n");
|
65 |
fprintf(stderr, "options:\n");
|
|
|
100 |
exit(1);
|
101 |
}
|
102 |
|
103 |
+
print_build_info();
|
104 |
printf("Starting Test\n");
|
105 |
|
106 |
// create the ggml context
|
|
|
126 |
|
127 |
//printf("Memsize required = %i\n", sizex*sizex);
|
128 |
|
129 |
+
// TODO: perform the bench for all types or for a user specified type
|
130 |
+
const ggml_type qtype = GGML_TYPE_Q4_1;
|
131 |
+
|
132 |
size_t ctx_size = 0;
|
133 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
134 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
135 |
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
136 |
+
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
137 |
+
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
138 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
139 |
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
140 |
ctx_size += 1024*1024*16;
|
|
|
167 |
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
168 |
ggml_set_f32(m2, 2.0f);
|
169 |
|
170 |
+
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
|
171 |
// printf("Creating new tensor m11xm2\n");
|
172 |
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
173 |
|
|
|
185 |
|
186 |
TENSOR_DUMP(gf.nodes[0]);
|
187 |
|
188 |
+
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
189 |
|
190 |
int32_t nelements = sizex*sizey;
|
|
|
191 |
|
192 |
std::vector<int64_t> hist_cur(1 << 4, 0);
|
193 |
|
194 |
// Set up a the benchmark matrices
|
195 |
// printf("Creating new tensor q11 & Running quantize\n");
|
196 |
+
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
197 |
+
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
198 |
|
199 |
// Set up a the compute graph
|
200 |
// printf("Creating new tensor q31\n");
|
|
|
205 |
|
206 |
// Set up a second graph computation to make sure we override the CPU cache lines
|
207 |
// printf("Creating new tensor q12 & Running quantize\n");
|
208 |
+
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
209 |
+
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
210 |
|
211 |
// printf("Creating new tensor q32\n");
|
212 |
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
|
|
223 |
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
224 |
|
225 |
|
226 |
+
// Let's use the F32 result from above as a reference for the quantized multiplication
|
227 |
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
228 |
|
229 |
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
|
|
253 |
// Check that the matrix multiplication result is in the right ballpark
|
254 |
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
255 |
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
256 |
+
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
257 |
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
258 |
|
259 |
if (delta > allowed_delta) {
|
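For clarity, the `allowed_delta` check above amounts to a relative tolerance of 10^-6 on the summed matrix elements. A small illustrative sketch of the same test (not part of the commit):

```python
# Sketch of the acceptance test used in benchmark-matmult: the quantized matmul's element
# sum must stay within a relative epsilon of 1e-6 of the F32 reference sum.
def within_tolerance(sum_q4: float, sum_f32_reference: float, rel_eps: float = 1e-6) -> bool:
    allowed_delta = sum_f32_reference * rel_eps
    return abs(sum_q4 - sum_f32_reference) <= allowed_delta

# A tiny deviation passes, a large one fails.
assert within_tolerance(1000.0002, 1000.0)
assert not within_tolerance(1010.0, 1000.0)
```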
examples/chat.sh
CHANGED
@@ -11,6 +11,6 @@ cd ..
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/
+./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
examples/convert-llama2c-to-ggml/CMakeLists.txt
ADDED
@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/convert-llama2c-to-ggml/README.md
ADDED
@@ -0,0 +1,26 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
+
+To convert the model, first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+
+`$ make -j`
+
+After successful compilation, the following usage options are available:
+```
+usage: ./convert-llama2c-to-ggml [options]
+
+options:
+  -h, --help                       show this help message and exit
+  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
+  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
+  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default 'ak_llama_model.bin')
+```
+
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
+
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
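The convert-then-run flow in this README can also be scripted. A hedged sketch that simply wraps the two commands shown above with subprocess (the binaries and model files are the README's examples and must already exist in the working directory):

```python
import subprocess

# 1. Convert Karpathy's llama2.c checkpoint to gguf (command taken from the README above).
subprocess.run([
    "./convert-llama2c-to-ggml",
    "--copy-vocab-from-model", "llama-2-7b-chat.gguf.q2_K.bin",
    "--llama2c-model", "stories42M.bin",
    "--llama2c-output-model", "stories42M.gguf.bin",
], check=True)

# 2. Run the converted model with ./main (same parameters as the README example).
subprocess.run([
    "./main", "-m", "stories42M.gguf.bin",
    "-p", "One day, Lily met a Shoggoth", "-n", "500", "-c", "256",
], check=True)
```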
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
ADDED
@@ -0,0 +1,963 @@
|
#include "ggml.h"
#include "llama.h"
#include "common.h"

#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>

// GGUF keys & tensor names.

#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"

#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"

#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"

#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3

#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2

//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
    int dim; // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers; // number of layers
    int n_heads; // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len; // max sequence length
} Config;

struct TransformerWeights {
    // token embedding table
    float* token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float* rms_att_weight; // (layer, dim) rmsnorm weights
    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    float* wq; // (layer, dim, dim)
    float* wk; // (layer, dim, dim)
    float* wv; // (layer, dim, dim)
    float* wo; // (layer, dim, dim)
    // weights for ffn
    float* w1; // (layer, hidden_dim, dim)
    float* w2; // (layer, dim, hidden_dim)
    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    // float* freq_cis_real; // (seq_len, dim/2)
    // float* freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    float* wcls;

    ~TransformerWeights() {
        delete[] token_embedding_table;
        delete[] rms_att_weight;
        delete[] rms_ffn_weight;
        delete[] wq;
        delete[] wk;
        delete[] wv;
        delete[] wo;
        delete[] w1;
        delete[] w2;
        delete[] w3;
        delete[] rms_final_weight;
        delete[] wcls;
    }
};

static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
    // we calloc instead of malloc to keep valgrind happy
    w->token_embedding_table = new float[p->vocab_size * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

    w->rms_att_weight = new float[p->n_layers * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

    w->wq = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wk = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wv = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wo = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

    w->rms_final_weight = new float[p->dim]();
    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

    if (shared_weights) {
        w->wcls = NULL;
    } else {
        w->wcls = new float[p->vocab_size * p->dim]();
        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
    }
}
|
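// Reads the flat llama2.c tensors from the checkpoint stream in the exact order
// llama2.c writes them; returns non-zero if any tensor is truncated or if bytes
// remain after the expected end of the file.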
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
        return 1;
    }

    return 0;
}

static void print_sample_weights(TransformerWeights *w){
    printf("----- Quick print of first of the weight values of all the variables\n");
    printf("%f\n", w->token_embedding_table[0]);
    printf("%f\n", w->rms_att_weight[0]);
    printf("%f\n", w->rms_ffn_weight[0]);

    printf("%f\n", w->wq[0]);
    printf("%f\n", w->wk[0]);
    printf("%f\n", w->wv[0]);
    printf("%f\n", w->wo[0]);
    printf("%f\n", w->w1[0]);
    printf("%f\n", w->w2[0]);
    printf("%f\n", w->w3[0]);
    printf("%f\n", w->rms_att_weight[0]);
    if (w->wcls) printf("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct llama_vocab {
    using id = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_ff    = 11008;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
};

struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    std::string name;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};

struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int warmup;
    int cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int lbfgs_n_iter;
    int adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};
|
static void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
    printf("%s: n_head:  %d\n", __func__, params->n_head);
    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
    printf("%s: n_layer: %d\n", __func__, params->n_layer);
    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
}

static void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);

    model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);

    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

    // printing the per-layer allocations here so we don't print in the for loop.
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);

    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);

    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm, "norm.weight");
    ggml_set_name(model->output, "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        printf(" %f", p);
    }
    printf("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            printf(" %.2f", p);
        }
        printf("\n");
    }
}

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}
|
struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }
    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static bool is_ggml_file(const char * filename) {
    llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    uint32_t magic = file.read_u32();
    return magic == GGUF_MAGIC;
}

static std::string llama_escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        if (c == ' ') out << "\xe2\x96\x81";
        else out << c;
    }
    return out.str();
}

static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
    if (is_ggml_file(filename)) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ false,
            /*.ctx      = */ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(filename, params);
        GGML_ASSERT(ctx != NULL);

        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
        GGML_ASSERT(model_idx >= 0);
        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);

        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
        GGML_ASSERT(token_idx >= 0);

        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
        GGML_ASSERT(score_idx >= 0);
        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);

        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

        vocab->id_to_token.resize(n_vocab);

        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);

            vocab->token_to_id[word] = i;

            auto & token_data = vocab->id_to_token[i];
            token_data.text = std::move(word);
            token_data.score = scores[i];
            token_data.type = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (llama_vocab::id id=0; id<n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
            text = llama_escape_whitespaces(text);

            vocab->id_to_token[id].text = text;
            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
}
|
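// Copies a flat llama2.c ("AK") float array into an already-allocated ggml tensor,
// walking the tensor's nb[] byte strides so the element order matches ggml's layout.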
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
    switch (gg_weights->n_dims){
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
                *ptr = karpathy_weights[ct];
                ct++;
            }
            break;
        case 2:
            ct = 0;
            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
                    *ptr = karpathy_weights[ct];
                    ct++;
                }
            }
            break;
        case 3:
            ct = 0;
            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
                        *ptr = karpathy_weights[ct];
                        ct++;
                    }
                }
            }
            break;
    }
}

static void save_as_llama_model(
    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float* -> struct ggml_tensor
    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);

    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
        // 1d
        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        convert_weights_ak_to_gg(layer.ffn_norm, &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq, &w->wq[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wk, &w->wk[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wv, &w->wv[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo, &w->wo[i*row_length*row_length]);

        convert_weights_ak_to_gg(layer.w1, &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2, &w->w2[i*n_ff*row_length]);
        convert_weights_ak_to_gg(layer.w3, &w->w3[i*row_length*n_ff]);
    }

    struct gguf_context * ctx = gguf_init_empty();

    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());

    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);

    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");

    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);

    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    // n_head_kv is optional, default to n_head
    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

    // write tensors
    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
    gguf_add_tensor(ctx, model->tok_embeddings);

    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);

    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_format_name(layer.wq, TN_ATTN_Q, i);
        gguf_add_tensor(ctx, layer.wq);

        ggml_format_name(layer.wk, TN_ATTN_K, i);
        gguf_add_tensor(ctx, layer.wk);

        ggml_format_name(layer.wv, TN_ATTN_V, i);
        gguf_add_tensor(ctx, layer.wv);

        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);

        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);

        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);

        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);

        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);

        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }

    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
}
|
static struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = true;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}

static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}

static bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found){
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}

static std::string basename(const std::string &path) {
    size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}
|
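// Overall flow: parse the command line, read the llama2.c checkpoint (Config header
// plus raw float weights), load a vocabulary from either a gguf model or a llama2.c
// tokenizer file, build the ggml tensors and write the result out as a gguf model file.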
int main(int argc, char ** argv) {
    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
    Config config;
    TransformerWeights weights = {};
    {
        FILE *file = fopen(params.fn_llama2c_model, "rb");
        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
        // read in the config header
        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
        malloc_weights(&weights, &config, shared_weights);
        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
        fclose(file);
    }

    struct llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
    model.hparams.n_ctx   = params.n_ctx;
    model.hparams.n_embd  = config.dim; //params.n_embd;
    model.hparams.n_ff    = config.hidden_dim;
    model.hparams.n_mult  = 32;//params.n_mult;
    model.hparams.n_head  = config.n_heads; //params.n_head;
    model.hparams.n_layer = config.n_layers; //params.n_layer;
    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
    print_params(&model.hparams);
    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
}
examples/embd-input/embd-input-lib.cpp
CHANGED
@@ -1,8 +1,5 @@
-
-#
-#define _GNU_SOURCE
-#endif
-
+#include "build-info.h"
+#include "common.h"
 #include "embd-input.h"
 
 #include <cassert>
@@ -23,11 +20,11 @@ extern "C" {
 struct MyModel* create_mymodel(int argc, char ** argv) {
     gpt_params params;
 
-    if (gpt_params_parse(argc, argv, params)
+    if (!gpt_params_parse(argc, argv, params)) {
         return nullptr;
     }
 
-
+    print_build_info();
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = uint32_t(time(NULL));
@@ -167,7 +164,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
     // TODO: Apply penalties
-    // float nl_logit = logits[llama_token_nl()];
+    // float nl_logit = logits[llama_token_nl(ctx)];
     // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
     // llama_sample_repetition_penalty(ctx, &candidates_p,
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +173,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
     //     last_n_repeat, alpha_frequency, alpha_presence);
     // if (!penalize_nl) {
-    //     logits[llama_token_nl()] = nl_logit;
+    //     logits[llama_token_nl(ctx)] = nl_logit;
    // }
 
     if (temp <= 0) {
@@ -211,10 +208,10 @@ const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
     static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
-        ret =
+        ret = llama_token_to_piece(ctx, id);
     }
     eval_id(mymodel, id);
     return ret.c_str();